diff --git a/Sources/Scripta/MeetingRecorder.swift b/Sources/Scripta/MeetingRecorder.swift index 3856a3b..4da2e99 100644 --- a/Sources/Scripta/MeetingRecorder.swift +++ b/Sources/Scripta/MeetingRecorder.swift @@ -118,6 +118,7 @@ final class MeetingRecorder: NSObject, ObservableObject { } } + whisperEngine.language = recognitionLanguage.components(separatedBy: "-").first?.lowercased() ?? "en" whisperEngine.onTranscript = { [weak self] text in guard let self, self.state == .recording, !self.micMuted else { return } self.appendWhisperTranscript(text) @@ -309,6 +310,10 @@ final class MeetingRecorder: NSObject, ObservableObject { mplog("System audio: buffer #\(systemBufferCount) frames=\(pcm.frameLength) rate=\(pcm.format.sampleRate) ch=\(pcm.format.channelCount) fmt=\(pcm.format.commonFormat.rawValue)") } + // Write original quality (48 kHz) to the audio file. + writeSystemAudio(pcm) + + // Downsample to 16 kHz only for SFSpeech recognition. let buffer: AVAudioPCMBuffer if pcm.format.sampleRate == recognitionFormat.sampleRate && pcm.format.channelCount == recognitionFormat.channelCount && @@ -319,8 +324,6 @@ final class MeetingRecorder: NSObject, ObservableObject { buffer = converted } - writeSystemAudio(buffer) - if !systemRecognitionStarted && state == .recording { systemRecognitionStarted = true DispatchQueue.main.async { [weak self] in @@ -676,9 +679,9 @@ final class MeetingRecorder: NSObject, ObservableObject { private static let audioFileSettings: [String: Any] = [ AVFormatIDKey: kAudioFormatMPEG4AAC, - AVSampleRateKey: 16_000, + AVSampleRateKey: 48_000, AVNumberOfChannelsKey: 1, - AVEncoderBitRateKey: 48_000, + AVEncoderBitRateKey: 128_000, ] private func startAudioWriters() { @@ -728,7 +731,7 @@ final class MeetingRecorder: NSObject, ObservableObject { output.frameLength = outFrames guard let outPtr = output.floatChannelData?[0] else { return } - if abs(ratio - 1.0) < 0.001 && buffer.format.channelCount == 1 { + if abs(ratio - 1.0) < 0.001 { memcpy(outPtr, ch0, Int(frames) * MemoryLayout.size) } else { let srcCount = Int(frames) diff --git a/Sources/Scripta/SystemAudioCapture.swift b/Sources/Scripta/SystemAudioCapture.swift index f4f71a5..429e158 100644 --- a/Sources/Scripta/SystemAudioCapture.swift +++ b/Sources/Scripta/SystemAudioCapture.swift @@ -61,7 +61,7 @@ final class SystemAudioCapture: NSObject, SCStreamDelegate, SCStreamOutput { let config = SCStreamConfiguration() config.capturesAudio = true config.excludesCurrentProcessAudio = true - config.sampleRate = 16_000 + config.sampleRate = 48_000 config.channelCount = 1 config.queueDepth = 8 config.width = 2 diff --git a/Sources/Scripta/WhisperEngine.swift b/Sources/Scripta/WhisperEngine.swift index d8c694a..14831ba 100644 --- a/Sources/Scripta/WhisperEngine.swift +++ b/Sources/Scripta/WhisperEngine.swift @@ -21,6 +21,7 @@ final class WhisperEngine { private var isProcessing = false var onTranscript: ((String) -> Void)? + var language: String = "en" static let defaultModelName = "ggml-base.bin" @@ -127,7 +128,7 @@ final class WhisperEngine { params.print_special = false params.no_context = true params.single_segment = false - let langStr = strdup("en") + let langStr = strdup(self.language) params.language = UnsafePointer(langStr) params.n_threads = 4