speech-recognition by dpearson2699/swift-ios-skills
npx skills add https://github.com/dpearson2699/swift-ios-skills --skill speech-recognition
Transcribe live and pre-recorded audio to text using Apple's Speech framework. Covers SFSpeechRecognizer (iOS 10+) and the new SpeechAnalyzer API (iOS 26+).
SpeechAnalyzer is an actor-based API introduced in iOS 26 that replaces SFSpeechRecognizer for new projects. It is built on Swift concurrency, delivers results as an AsyncSequence, and supports modular analysis through composable modules such as SpeechTranscriber.
import Speech
// 1. Create a transcriber module
guard let locale = SpeechTranscriber.supportedLocale(
equivalentTo: Locale.current
) else { return }
let transcriber = SpeechTranscriber(locale: locale, preset: .offlineTranscription)
// 2. Ensure assets are installed
if let request = try await AssetInventory.assetInstallationRequest(
supporting: [transcriber]
) {
try await request.downloadAndInstall()
}
// 3. Create input stream and analyzer
let (inputSequence, inputBuilder) = AsyncStream.makeStream(of: AnalyzerInput.self)
let audioFormat = await SpeechAnalyzer.bestAvailableAudioFormat(
compatibleWith: [transcriber]
)
let analyzer = SpeechAnalyzer(modules: [transcriber])
// 4. Feed audio buffers (from AVAudioEngine or file)
Task {
// Append PCM buffers converted to audioFormat
let pcmBuffer: AVAudioPCMBuffer = // ... your audio buffer
inputBuilder.yield(AnalyzerInput(buffer: pcmBuffer))
inputBuilder.finish()
}
// 5. Consume results
Task {
for try await result in transcriber.results {
let text = String(result.text.characters)
print(text)
}
}
// 6. Run analysis
let lastSampleTime = try await analyzer.analyzeSequence(inputSequence)
// 7. Finalize
if let lastSampleTime {
try await analyzer.finalizeAndFinish(through: lastSampleTime)
} else {
try await analyzer.cancelAndFinishNow()
}
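The quick start elides how live microphone audio reaches the input stream. A sketch of one approach, assuming the inputBuilder from step 3 and an already-unwrapped analyzer format; the helper name and the AVAudioConverter-based conversion are illustrative, not part of the framework sample:
import AVFoundation // in addition to Speech
// Sketch: capture microphone audio, convert it to the analyzer's preferred
// format, and yield it into the AnalyzerInput stream.
func startLiveInput(
    into inputBuilder: AsyncStream<AnalyzerInput>.Continuation,
    analyzerFormat: AVAudioFormat
) throws -> AVAudioEngine {
    let engine = AVAudioEngine()
    let inputNode = engine.inputNode
    let tapFormat = inputNode.outputFormat(forBus: 0)
    let converter = AVAudioConverter(from: tapFormat, to: analyzerFormat)
    inputNode.installTap(onBus: 0, bufferSize: 4096, format: tapFormat) { buffer, _ in
        guard let converter,
              let converted = AVAudioPCMBuffer(
                  pcmFormat: analyzerFormat,
                  frameCapacity: buffer.frameCapacity
              ) else { return }
        var conversionError: NSError?
        var provided = false
        // Pull the tap buffer through the converter into the analyzer's format
        let status = converter.convert(to: converted, error: &conversionError) { _, outStatus in
            if provided {
                outStatus.pointee = .noDataNow
                return nil
            }
            provided = true
            outStatus.pointee = .haveData
            return buffer
        }
        if status == .haveData, conversionError == nil {
            inputBuilder.yield(AnalyzerInput(buffer: converted))
        }
    }
    engine.prepare()
    try engine.start()
    return engine
}
// Stop with engine.stop(), remove the tap, then call inputBuilder.finish()
// so analyzeSequence(_:) can return.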
For pre-recorded audio, SpeechAnalyzer can read directly from an AVAudioFile and finish automatically when the file ends:
let transcriber = SpeechTranscriber(locale: locale, preset: .offlineTranscription)
let audioFile = try AVAudioFile(forReading: fileURL)
let analyzer = SpeechAnalyzer(
inputAudioFile: audioFile, modules: [transcriber], finishAfterFile: true
)
for try await result in transcriber.results {
print(String(result.text.characters))
}
| Feature | SFSpeechRecognizer | SpeechAnalyzer |
|---|---|---|
| Concurrency | Callbacks/delegates | async/await + AsyncSequence |
| Type | class | actor |
| Modules | Monolithic | Composable (SpeechTranscriber, SpeechDetector) |
| Audio input | append(_:) on request | AsyncStream<AnalyzerInput> |
| Availability | iOS 10+ | iOS 26+ |
| On-device | requiresOnDeviceRecognition | Asset-based via AssetInventory |
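Both APIs can live in one codebase behind an availability check; a sketch of gating at the call site (the helper functions are hypothetical):
func transcribe(fileAt url: URL) async throws -> String {
    if #available(iOS 26, *) {
        // Prefer the newer actor-based API where available
        return try await transcribeWithSpeechAnalyzer(url)      // hypothetical helper
    } else {
        // Fall back to SFSpeechRecognizer on earlier systems
        return try await transcribeWithSFSpeechRecognizer(url)  // hypothetical helper
    }
}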
Creating an SFSpeechRecognizer and checking availability:
import Speech
// Default locale (user's current language)
let recognizer = SFSpeechRecognizer()
// Specific locale
let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
// Check if recognition is available for this locale
guard let recognizer, recognizer.isAvailable else {
print("Speech recognition not available")
return
}
Availability can change at runtime (for example, when the network drops), so set a delegate:
final class SpeechManager: NSObject, SFSpeechRecognizerDelegate {
private let recognizer = SFSpeechRecognizer()!
override init() {
super.init()
recognizer.delegate = self
}
func speechRecognizer(
_ speechRecognizer: SFSpeechRecognizer,
availabilityDidChange available: Bool
) {
// Update UI — disable record button when unavailable
}
}
Request both speech recognition and microphone permissions before starting live transcription. Add these keys to Info.plist:
NSSpeechRecognitionUsageDescription
NSMicrophoneUsageDescription
import Speech
import AVFoundation
func requestPermissions() async -> Bool {
let speechStatus = await withCheckedContinuation { continuation in
SFSpeechRecognizer.requestAuthorization { status in
continuation.resume(returning: status)
}
}
guard speechStatus == .authorized else { return false }
let micStatus: Bool
if #available(iOS 17, *) {
micStatus = await AVAudioApplication.requestRecordPermission()
} else {
micStatus = await withCheckedContinuation { continuation in
AVAudioSession.sharedInstance().requestRecordPermission { granted in
continuation.resume(returning: granted)
}
}
}
return micStatus
}
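A typical call site gates all audio work on the result (sketch):
Task {
    // Gate any audio capture on both permissions
    guard await requestPermissions() else {
        // Permission denied — surface UI guiding the user to Settings
        return
    }
    // Safe to start the audio engine / recognition here
}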
The standard pattern: AVAudioEngine captures microphone audio → buffers are appended to SFSpeechAudioBufferRecognitionRequest → results stream in.
import Speech
import AVFoundation
final class LiveTranscriber {
private let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!
private let audioEngine = AVAudioEngine()
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
func startTranscribing() throws {
// Cancel any in-progress task
recognitionTask?.cancel()
recognitionTask = nil
// Configure audio session
let audioSession = AVAudioSession.sharedInstance()
try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
// Create request
let request = SFSpeechAudioBufferRecognitionRequest()
request.shouldReportPartialResults = true
self.recognitionRequest = request
// Start recognition task
recognitionTask = recognizer.recognitionTask(with: request) { result, error in
if let result {
let text = result.bestTranscription.formattedString
print("Transcription: \(text)")
if result.isFinal {
self.stopTranscribing()
}
}
if let error {
print("Recognition error: \(error)")
self.stopTranscribing()
}
}
// Install audio tap
let inputNode = audioEngine.inputNode
let recordingFormat = inputNode.outputFormat(forBus: 0)
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) {
buffer, _ in
request.append(buffer)
}
audioEngine.prepare()
try audioEngine.start()
}
func stopTranscribing() {
audioEngine.stop()
audioEngine.inputNode.removeTap(onBus: 0)
recognitionRequest?.endAudio()
recognitionRequest = nil
recognitionTask?.cancel()
recognitionTask = nil
}
}
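Using it from UI code might look like this (a sketch; the calling-side method names are placeholders):
let liveTranscriber = LiveTranscriber()

func recordButtonTapped() {
    do {
        // Starts the audio session, engine, and recognition task
        try liveTranscriber.startTranscribing()
    } catch {
        print("Could not start transcription: \(error)")
    }
}

func stopButtonTapped() {
    // Tears down the tap, ends the request, and cancels the task
    liveTranscriber.stopTranscribing()
}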
Use SFSpeechURLRecognitionRequest for audio files on disk:
func transcribeFile(at url: URL) async throws -> String {
guard let recognizer = SFSpeechRecognizer(), recognizer.isAvailable else {
throw SpeechError.unavailable
}
let request = SFSpeechURLRecognitionRequest(url: url)
request.shouldReportPartialResults = false
return try await withCheckedThrowingContinuation { continuation in
recognizer.recognitionTask(with: request) { result, error in
if let error {
continuation.resume(throwing: error)
} else if let result, result.isFinal {
continuation.resume(
returning: result.bestTranscription.formattedString
)
}
}
}
}
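SpeechError is not a framework type; a minimal definition covering only the case used above could be:
enum SpeechError: Error {
    case unavailable
}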
On-device recognition (iOS 13+) works offline but supports fewer locales:
let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!
// Check if on-device is supported for this locale
if recognizer.supportsOnDeviceRecognition {
let request = SFSpeechAudioBufferRecognitionRequest()
request.requiresOnDeviceRecognition = true // Force on-device
}
Tip: On-device recognition avoids network latency and the one-minute audio limit imposed by server-based recognition. However, accuracy may be lower and not all locales are supported. Check supportsOnDeviceRecognition before forcing on-device mode.
Handling partial vs. final results:
let request = SFSpeechAudioBufferRecognitionRequest()
request.shouldReportPartialResults = true // default is true
recognizer.recognitionTask(with: request) { result, error in
guard let result else { return }
if result.isFinal {
// Final transcription — recognition is complete
let final = result.bestTranscription.formattedString
} else {
// Partial result — may change as more audio is processed
let partial = result.bestTranscription.formattedString
}
}
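When recognition sessions are restarted periodically (see the one-minute limit below), it can help to keep text committed by finished sessions separate from the current session's partial result. A sketch, with placeholder property names:
var committedText = ""   // text from sessions that ended with a final result
var pendingText = ""     // latest partial, replaced on every update

recognizer.recognitionTask(with: request) { result, _ in
    guard let result else { return }
    if result.isFinal {
        // Commit the finished session's transcript and clear the partial
        committedText += result.bestTranscription.formattedString
        pendingText = ""
    } else {
        // Partials are cumulative for the current session; overwrite, don't append
        pendingText = result.bestTranscription.formattedString
    }
    // Display committedText + pendingText
}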
Beyond the best transcription, each result exposes alternative interpretations and per-segment confidence:
recognizer.recognitionTask(with: request) { result, error in
guard let result else { return }
// Best transcription
let best = result.bestTranscription
// All alternatives (sorted by confidence, descending)
for transcription in result.transcriptions {
for segment in transcription.segments {
print("\(segment.substring): \(segment.confidence)")
}
}
}
Automatic punctuation (iOS 16+):
let request = SFSpeechAudioBufferRecognitionRequest()
request.addsPunctuation = true
Improve recognition of domain-specific terms:
let request = SFSpeechAudioBufferRecognitionRequest()
request.contextualStrings = ["SwiftUI", "Xcode", "CloudKit"]
Common mistakes and how to avoid them:
// ❌ DON'T: Only request speech authorization for live audio
SFSpeechRecognizer.requestAuthorization { status in
// Missing microphone permission — audio engine will fail
self.startRecording()
}
// ✅ DO: Request both permissions before recording
SFSpeechRecognizer.requestAuthorization { status in
guard status == .authorized else { return }
AVAudioSession.sharedInstance().requestRecordPermission { granted in
guard granted else { return }
self.startRecording()
}
}
// ❌ DON'T: Assume recognizer stays available after initial check
let recognizer = SFSpeechRecognizer()!
// Recognition may fail if network drops or locale changes
// ✅ DO: Monitor availability via delegate
recognizer.delegate = self
func speechRecognizer(
_ speechRecognizer: SFSpeechRecognizer,
availabilityDidChange available: Bool
) {
recordButton.isEnabled = available
}
// ❌ DON'T: Leave audio engine running after recognition finishes
recognizer.recognitionTask(with: request) { result, error in
if result?.isFinal == true {
// Audio engine still running, wasting resources and battery
}
}
// ✅ DO: Clean up all audio resources
recognizer.recognitionTask(with: request) { result, error in
if result?.isFinal == true || error != nil {
self.audioEngine.stop()
self.audioEngine.inputNode.removeTap(onBus: 0)
self.recognitionRequest?.endAudio()
self.recognitionRequest = nil
}
}
// ❌ DON'T: Force on-device without checking support
let request = SFSpeechAudioBufferRecognitionRequest()
request.requiresOnDeviceRecognition = true // May silently fail
// ✅ DO: Check support before requiring on-device
if recognizer.supportsOnDeviceRecognition {
request.requiresOnDeviceRecognition = true
} else {
// Fall back to server-based or inform user
}
// ❌ DON'T: Start one long continuous recognition session
func startRecording() {
// This will be cut off after ~60 seconds (server-based)
}
// ✅ DO: Restart recognition when approaching the limit
func startRecording() {
// Use a timer to restart before the limit
recognitionTimer = Timer.scheduledTimer(withTimeInterval: 55, repeats: false) {
[weak self] _ in
self?.restartRecognition()
}
}
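restartRecognition() is not shown above; a minimal sketch, reusing the LiveTranscriber methods from earlier:
func restartRecognition() {
    // Tear down the current session, then immediately begin a new one
    stopTranscribing()
    do {
        try startTranscribing()
    } catch {
        print("Failed to restart recognition: \(error)")
    }
}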
// ❌ DON'T: Start a new task without canceling the previous one
func startRecording() {
recognitionTask = recognizer.recognitionTask(with: request) { ... }
// Previous task is still running — undefined behavior
}
// ✅ DO: Cancel existing task before creating a new one
func startRecording() {
recognitionTask?.cancel()
recognitionTask = nil
recognitionTask = recognizer.recognitionTask(with: request) { ... }
}
Checklist:
- NSSpeechRecognitionUsageDescription is in Info.plist
- NSMicrophoneUsageDescription is in Info.plist (if using live audio)
- SFSpeechRecognizerDelegate is set to handle availabilityDidChange
- recognitionRequest.endAudio() is called when done recording
- recognitionTask is canceled before starting a new one
- supportsOnDeviceRecognition is checked before requiring on-device mode
- Partial and final (isFinal) results are handled separately
- AssetInventory assets are installed before using SpeechAnalyzer
- SpeechTranscriber.supportedLocale(equivalentTo:) is checked
Weekly Installs
353
Repository
GitHub Stars
269
First Seen
Mar 8, 2026
Security Audits
Gen Agent Trust Hub: Pass
Socket: Pass
Snyk: Pass
Installed on
codex: 350
kimi-cli: 347
amp: 347
cline: 347
github-copilot: 347
opencode: 347