npx skills add https://github.com/answerzhao/agent-skills --skill ASR
This skill guides the implementation of speech-to-text (ASR) functionality using the z-ai-web-dev-sdk package, enabling accurate transcription of spoken audio into text.
Skill Location : {project_path}/skills/ASR
This skill is located at the above path in your project.
Reference Scripts : Example test scripts are available in the {Skill Location}/scripts/ directory for quick testing and reference. See {Skill Location}/scripts/asr.ts for a working example.
Speech-to-Text (ASR - Automatic Speech Recognition) allows you to build applications that convert spoken language in audio files into written text, enabling voice-controlled interfaces, transcription services, and audio content analysis.
IMPORTANT : z-ai-web-dev-sdk MUST be used in backend code only. Never use it in client-side code.
The z-ai-web-dev-sdk package is already installed. Import it as shown in the examples below.
For simple audio transcription tasks, you can use the z-ai CLI instead of writing code. This is ideal for quick transcriptions, testing audio files, or batch processing.
# Transcribe an audio file
z-ai asr --file ./audio.wav
# Save transcription to JSON file
z-ai asr -f ./recording.mp3 -o transcript.json
# Transcribe and view output
z-ai asr --file ./interview.wav --output result.json
# Transcribe from base64 encoded audio
z-ai asr --base64 "UklGRiQAAABXQVZFZm10..." -o result.json
# Using short option
z-ai asr -b "base64_encoded_audio_data" -o transcript.json
# Stream transcription results
z-ai asr -f ./audio.wav --stream
--file, -f <path>: Required (if not using --base64) - Audio file path
--base64, -b <base64>: Required (if not using --file) - Base64-encoded audio
--output, -o <path>: Optional - Output file path (JSON format)
--stream: Optional - Stream the transcription output
The ASR service supports various audio formats, including WAV, MP3, M4A, FLAC, and OGG.
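If you want to batch-drive the CLI from a script rather than use the SDK, a minimal sketch (assuming the z-ai binary is on your PATH; only the documented --file/--output flags are used) could look like this:
// batch-cli.mjs (sketch): run the z-ai CLI over several files from Node.
import { execFile } from 'child_process';
import { promisify } from 'util';
const run = promisify(execFile);
const files = ['./audio1.wav', './audio2.wav'];
for (const file of files) {
  const out = file.replace(/\.[^.]+$/, '.json');
  await run('z-ai', ['asr', '--file', file, '--output', out]);
  console.log(`Saved transcription of ${file} to ${out}`);
}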
Use the CLI for: quick one-off transcriptions, testing audio files, and simple batch jobs from the shell.
Use the SDK for: integrating transcription into backend application code, where you need custom error handling, caching, or API endpoints, as shown in the examples below.
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
async function transcribeAudio(audioFilePath) {
const zai = await ZAI.create();
// Read audio file and convert to base64
const audioFile = fs.readFileSync(audioFilePath);
const base64Audio = audioFile.toString('base64');
const response = await zai.audio.asr.create({
file_base64: base64Audio
});
return response.text;
}
// Usage
const transcription = await transcribeAudio('./audio.wav');
console.log('Transcription:', transcription);
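Note that the top-level await in the usage snippets assumes an ES module context (a .mjs file or "type": "module" in package.json). Otherwise, a small wrapper like this sketch works:
// CommonJS-friendly usage (sketch): wrap the call in an async main().
async function main() {
  const transcription = await transcribeAudio('./audio.wav');
  console.log('Transcription:', transcription);
}
main().catch(console.error);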
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
async function transcribeBatch(audioFilePaths) {
const zai = await ZAI.create();
const results = [];
for (const filePath of audioFilePaths) {
try {
const audioFile = fs.readFileSync(filePath);
const base64Audio = audioFile.toString('base64');
const response = await zai.audio.asr.create({
file_base64: base64Audio
});
results.push({
file: filePath,
success: true,
transcription: response.text
});
} catch (error) {
results.push({
file: filePath,
success: false,
error: error.message
});
}
}
return results;
}
// Usage
const files = ['./interview1.wav', './interview2.wav', './interview3.wav'];
const transcriptions = await transcribeBatch(files);
transcriptions.forEach(result => {
if (result.success) {
console.log(`${result.file}: ${result.transcription}`);
} else {
console.error(`${result.file}: Error - ${result.error}`);
}
});
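The loop above transcribes files one at a time. If you need more throughput, here is a sketch of processing a few files at once (it reuses the ZAI and fs imports above; the chunk size of 3 is an arbitrary assumption, and whether the ASR service tolerates parallel requests should be verified):
// Concurrent variant (sketch): process files in chunks of CONCURRENCY.
const CONCURRENCY = 3;
async function transcribeBatchConcurrent(audioFilePaths) {
  const zai = await ZAI.create();
  const results = [];
  for (let i = 0; i < audioFilePaths.length; i += CONCURRENCY) {
    const chunk = audioFilePaths.slice(i, i + CONCURRENCY);
    const settled = await Promise.allSettled(
      chunk.map(async (filePath) => {
        const base64Audio = fs.readFileSync(filePath).toString('base64');
        const response = await zai.audio.asr.create({ file_base64: base64Audio });
        return { file: filePath, success: true, transcription: response.text };
      })
    );
    for (const [j, outcome] of settled.entries()) {
      results.push(
        outcome.status === 'fulfilled'
          ? outcome.value
          : { file: chunk[j], success: false, error: outcome.reason.message }
      );
    }
  }
  return results;
}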
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
import path from 'path';
async function transcribeWithMetadata(audioFilePath) {
const zai = await ZAI.create();
// Get file metadata
const stats = fs.statSync(audioFilePath);
const audioFile = fs.readFileSync(audioFilePath);
const base64Audio = audioFile.toString('base64');
const startTime = Date.now();
const response = await zai.audio.asr.create({
file_base64: base64Audio
});
const endTime = Date.now();
return {
filename: path.basename(audioFilePath),
filepath: audioFilePath,
fileSize: stats.size,
transcription: response.text,
wordCount: response.text.split(/\s+/).length,
processingTime: endTime - startTime,
timestamp: new Date().toISOString()
};
}
// Usage
const result = await transcribeWithMetadata('./meeting_recording.wav');
console.log('Transcription Details:', JSON.stringify(result, null, 2));
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
import crypto from 'crypto';
class ASRService {
constructor() {
this.zai = null;
this.transcriptionCache = new Map();
}
async initialize() {
this.zai = await ZAI.create();
}
generateCacheKey(audioBuffer) {
// Hash the raw audio bytes so identical files hit the cache regardless of path
return crypto.createHash('md5').update(audioBuffer).digest('hex');
}
async transcribe(audioFilePath, useCache = true) {
const audioBuffer = fs.readFileSync(audioFilePath);
const cacheKey = this.generateCacheKey(audioBuffer);
// Check cache
if (useCache && this.transcriptionCache.has(cacheKey)) {
return {
transcription: this.transcriptionCache.get(cacheKey),
cached: true
};
}
// Transcribe audio
const base64Audio = audioBuffer.toString('base64');
const response = await this.zai.audio.asr.create({
file_base64: base64Audio
});
// Cache result
if (useCache) {
this.transcriptionCache.set(cacheKey, response.text);
}
return {
transcription: response.text,
cached: false
};
}
clearCache() {
this.transcriptionCache.clear();
}
getCacheSize() {
return this.transcriptionCache.size;
}
}
// Usage
const asrService = new ASRService();
await asrService.initialize();
const result1 = await asrService.transcribe('./audio.wav');
console.log('First call (not cached):', result1);
const result2 = await asrService.transcribe('./audio.wav');
console.log('Second call (cached):', result2);
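One caveat: the in-memory Map grows without bound if you transcribe many distinct files. A minimal sketch of capping it, assuming a hypothetical MAX_CACHE_ENTRIES limit, is to evict the oldest entry before caching a new one (Maps iterate in insertion order):
// Sketch: keep the cache Map from growing past a hypothetical limit.
// Call this right before this.transcriptionCache.set(...) in ASRService.transcribe.
const MAX_CACHE_ENTRIES = 100;
function evictIfFull(cache, maxEntries = MAX_CACHE_ENTRIES) {
  if (cache.size >= maxEntries) {
    const oldestKey = cache.keys().next().value; // oldest = first inserted
    cache.delete(oldestKey);
  }
}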
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
import path from 'path';
async function transcribeDirectory(directoryPath, outputJsonPath) {
const zai = await ZAI.create();
// Get all audio files
const files = fs.readdirSync(directoryPath);
const audioFiles = files.filter(file =>
/\.(wav|mp3|m4a|flac|ogg)$/i.test(file)
);
const results = {
directory: directoryPath,
totalFiles: audioFiles.length,
processedAt: new Date().toISOString(),
transcriptions: []
};
for (const filename of audioFiles) {
const filePath = path.join(directoryPath, filename);
try {
const audioFile = fs.readFileSync(filePath);
const base64Audio = audioFile.toString('base64');
const response = await zai.audio.asr.create({
file_base64: base64Audio
});
results.transcriptions.push({
filename: filename,
success: true,
text: response.text,
wordCount: response.text.split(/\s+/).length
});
console.log(`✓ Transcribed: ${filename}`);
} catch (error) {
results.transcriptions.push({
filename: filename,
success: false,
error: error.message
});
console.error(`✗ Failed: ${filename} - ${error.message}`);
}
}
// Save results to JSON
fs.writeFileSync(
outputJsonPath,
JSON.stringify(results, null, 2)
);
return results;
}
// Usage
const results = await transcribeDirectory(
'./audio-recordings',
'./transcriptions.json'
);
console.log(`\nProcessed ${results.totalFiles} files`);
console.log(`Successful: ${results.transcriptions.filter(t => t.success).length}`);
console.log(`Failed: ${results.transcriptions.filter(t => !t.success).length}`);
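If you also want one plain-text transcript per audio file in addition to the combined JSON, a small helper (a sketch; it reuses the fs and path imports above) can be called after each successful transcription inside the loop:
// Sketch: write <audio name>.txt next to the source file with the transcribed text.
function writeTranscriptFile(directoryPath, audioFilename, text) {
  const txtPath = path.join(directoryPath, audioFilename.replace(/\.[^.]+$/, '.txt'));
  fs.writeFileSync(txtPath, text);
  return txtPath;
}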
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
async function transcribeAnyFormat(audioFilePath) {
// Supported formats: WAV, MP3, M4A, FLAC, OGG, etc.
const validExtensions = ['.wav', '.mp3', '.m4a', '.flac', '.ogg'];
const ext = audioFilePath.toLowerCase().substring(audioFilePath.lastIndexOf('.'));
if (!validExtensions.includes(ext)) {
throw new Error(`Unsupported audio format: ${ext}`);
}
const zai = await ZAI.create();
const audioFile = fs.readFileSync(audioFilePath);
const base64Audio = audioFile.toString('base64');
const response = await zai.audio.asr.create({
file_base64: base64Audio
});
return response.text;
}
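As an aside, Node's path.extname handles filenames without an extension more gracefully than the manual lastIndexOf slice; an equivalent check (sketch) would be:
import path from 'path';
// Sketch: same validation using path.extname, which returns '' when there is no extension.
function assertSupportedFormat(audioFilePath, validExtensions = ['.wav', '.mp3', '.m4a', '.flac', '.ogg']) {
  const ext = path.extname(audioFilePath).toLowerCase();
  if (!validExtensions.includes(ext)) {
    throw new Error(`Unsupported audio format: ${ext || '(none)'}`);
  }
  return ext;
}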
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
async function safeTranscribe(audioFilePath) {
try {
// Validate file exists
if (!fs.existsSync(audioFilePath)) {
throw new Error(`File not found: ${audioFilePath}`);
}
// Check file size (e.g., limit to 100MB)
const stats = fs.statSync(audioFilePath);
const fileSizeMB = stats.size / (1024 * 1024);
if (fileSizeMB > 100) {
throw new Error(`File too large: ${fileSizeMB.toFixed(2)}MB (max 100MB)`);
}
// Transcribe
const zai = await ZAI.create();
const audioFile = fs.readFileSync(audioFilePath);
const base64Audio = audioFile.toString('base64');
const response = await zai.audio.asr.create({
file_base64: base64Audio
});
if (!response.text || response.text.trim().length === 0) {
throw new Error('Empty transcription result');
}
return {
success: true,
transcription: response.text,
filePath: audioFilePath,
fileSize: stats.size
};
} catch (error) {
console.error('Transcription error:', error);
return {
success: false,
error: error.message,
filePath: audioFilePath
};
}
}
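If transient network or service errors are a concern, a small retry wrapper can sit on top of safeTranscribe; this is a sketch, and the attempt count and delay are arbitrary assumptions:
// Retry wrapper (sketch): retries safeTranscribe up to `attempts` times with a fixed delay.
// Note: this retries all failures, including non-transient ones such as a missing file.
async function transcribeWithRetry(audioFilePath, attempts = 3, delayMs = 1000) {
  let last;
  for (let i = 0; i < attempts; i++) {
    last = await safeTranscribe(audioFilePath);
    if (last.success) return last;
    await new Promise(resolve => setTimeout(resolve, delayMs));
  }
  return last;
}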
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
function cleanTranscription(text) {
// Remove excessive whitespace
text = text.replace(/\s+/g, ' ').trim();
// Capitalize first letter of sentences
text = text.replace(/(^\w|[.!?]\s+\w)/g, match => match.toUpperCase());
// Remove filler words (optional)
const fillers = ['um', 'uh', 'ah', 'like', 'you know'];
const fillerPattern = new RegExp(`\\b(${fillers.join('|')})\\b`, 'gi');
text = text.replace(fillerPattern, '').replace(/\s+/g, ' ');
return text;
}
async function transcribeAndClean(audioFilePath) {
const zai = await ZAI.create();
const audioFile = fs.readFileSync(audioFilePath);
const base64Audio = audioFile.toString('base64');
const response = await zai.audio.asr.create({
file_base64: base64Audio
});
return {
raw: response.text,
cleaned: cleanTranscription(response.text)
};
}
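A usage sketch following the pattern of the earlier examples (assuming ./audio.wav exists):
// Usage
const { raw, cleaned } = await transcribeAndClean('./audio.wav');
console.log('Raw transcription:', raw);
console.log('Cleaned transcription:', cleaned);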
import express from 'express';
import multer from 'multer';
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
const app = express();
const upload = multer({ dest: 'uploads/' });
let zaiInstance;
async function initZAI() {
zaiInstance = await ZAI.create();
}
app.post('/api/transcribe', upload.single('audio'), async (req, res) => {
try {
if (!req.file) {
return res.status(400).json({ error: 'No audio file provided' });
}
const audioFile = fs.readFileSync(req.file.path);
const base64Audio = audioFile.toString('base64');
const response = await zaiInstance.audio.asr.create({
file_base64: base64Audio
});
// Clean up uploaded file
fs.unlinkSync(req.file.path);
res.json({
success: true,
transcription: response.text,
wordCount: response.text.split(/\s+/).length
});
} catch (error) {
// Clean up on error
if (req.file && fs.existsSync(req.file.path)) {
fs.unlinkSync(req.file.path);
}
res.status(500).json({
success: false,
error: error.message
});
}
});
initZAI().then(() => {
app.listen(3000, () => {
console.log('ASR API running on port 3000');
});
});
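From the client side, the endpoint above expects a multipart upload; a browser-side sketch (the field name must match upload.single('audio'), and the file name 'recording.wav' is just an example) could look like this:
// Client-side sketch: upload a recorded audio Blob to the transcription endpoint.
async function uploadForTranscription(audioBlob) {
  const formData = new FormData();
  formData.append('audio', audioBlob, 'recording.wav'); // field name matches upload.single('audio')
  const res = await fetch('/api/transcribe', { method: 'POST', body: formData });
  if (!res.ok) throw new Error(`Transcription request failed: ${res.status}`);
  const { transcription, wordCount } = await res.json();
  return { transcription, wordCount };
}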
Issue : "SDK must be used in backend"
Issue : Empty or incorrect transcription
Issue : Large file processing fails
Issue : Slow transcription speed
Issue : Memory errors with large files
For best transcription results:
- Use a supported audio format (WAV, MP3, M4A, FLAC, OGG)
- Prefer clear recordings with minimal background noise
- Keep files within the size limit you enforce (the examples use 100MB as a cutoff)
- Validate files before sending them, as in the safeTranscribe example
- Post-process the text (see cleanTranscription) if you need polished output