From 0c22e3df7c4d5965c8dcb79c5296c609696fb263 Mon Sep 17 00:00:00 2001 From: Niklas Buse Date: Mon, 15 Jun 2026 18:32:16 +0200 Subject: [PATCH] fix: audio export quality and Whisper language selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - SystemAudioCapture: raise sample rate from 16 kHz to 48 kHz so exported audio captures the full voice frequency range (0–24 kHz) instead of being limited to 8 kHz (Nyquist of 16 kHz) - MeetingRecorder: update audio file settings to 48 kHz / 128 kbps AAC; write original 48 kHz PCM to the audio file in handleSystemAudioBuffer instead of the already-downsampled 16 kHz buffer that was fed to SFSpeech; fix writeMicAudio memcpy fast-path to also trigger for stereo hardware input (was gated on channelCount == 1 unnecessarily) - WhisperEngine: add `language` property (default "en"), use it in transcribeChunk instead of a hardcoded language string; set it from MeetingRecorder.startRecording() via the 2-letter ISO prefix of recognitionLanguage (e.g. "de-DE" → "de") Fixes #2, fixes #3 --- Sources/Scripta/MeetingRecorder.swift | 13 ++++++++----- Sources/Scripta/SystemAudioCapture.swift | 2 +- Sources/Scripta/WhisperEngine.swift | 3 ++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Sources/Scripta/MeetingRecorder.swift b/Sources/Scripta/MeetingRecorder.swift index 3856a3b..4da2e99 100644 --- a/Sources/Scripta/MeetingRecorder.swift +++ b/Sources/Scripta/MeetingRecorder.swift @@ -118,6 +118,7 @@ final class MeetingRecorder: NSObject, ObservableObject { } } + whisperEngine.language = recognitionLanguage.components(separatedBy: "-").first?.lowercased() ?? 
"en" whisperEngine.onTranscript = { [weak self] text in guard let self, self.state == .recording, !self.micMuted else { return } self.appendWhisperTranscript(text) @@ -309,6 +310,10 @@ final class MeetingRecorder: NSObject, ObservableObject { mplog("System audio: buffer #\(systemBufferCount) frames=\(pcm.frameLength) rate=\(pcm.format.sampleRate) ch=\(pcm.format.channelCount) fmt=\(pcm.format.commonFormat.rawValue)") } + // Write original quality (48 kHz) to the audio file. + writeSystemAudio(pcm) + + // Downsample to 16 kHz only for SFSpeech recognition. let buffer: AVAudioPCMBuffer if pcm.format.sampleRate == recognitionFormat.sampleRate && pcm.format.channelCount == recognitionFormat.channelCount && @@ -319,8 +324,6 @@ final class MeetingRecorder: NSObject, ObservableObject { buffer = converted } - writeSystemAudio(buffer) - if !systemRecognitionStarted && state == .recording { systemRecognitionStarted = true DispatchQueue.main.async { [weak self] in @@ -676,9 +679,9 @@ final class MeetingRecorder: NSObject, ObservableObject { private static let audioFileSettings: [String: Any] = [ AVFormatIDKey: kAudioFormatMPEG4AAC, - AVSampleRateKey: 16_000, + AVSampleRateKey: 48_000, AVNumberOfChannelsKey: 1, - AVEncoderBitRateKey: 48_000, + AVEncoderBitRateKey: 128_000, ] private func startAudioWriters() { @@ -728,7 +731,7 @@ final class MeetingRecorder: NSObject, ObservableObject { output.frameLength = outFrames guard let outPtr = output.floatChannelData?[0] else { return } - if abs(ratio - 1.0) < 0.001 && buffer.format.channelCount == 1 { + if abs(ratio - 1.0) < 0.001 { memcpy(outPtr, ch0, Int(frames) * MemoryLayout<Float>.size) } else { let srcCount = Int(frames) diff --git a/Sources/Scripta/SystemAudioCapture.swift b/Sources/Scripta/SystemAudioCapture.swift index f4f71a5..429e158 100644 --- a/Sources/Scripta/SystemAudioCapture.swift +++ b/Sources/Scripta/SystemAudioCapture.swift @@ -61,7 +61,7 @@ final class SystemAudioCapture: NSObject, SCStreamDelegate, SCStreamOutput { let 
config = SCStreamConfiguration() config.capturesAudio = true config.excludesCurrentProcessAudio = true - config.sampleRate = 16_000 + config.sampleRate = 48_000 config.channelCount = 1 config.queueDepth = 8 config.width = 2 diff --git a/Sources/Scripta/WhisperEngine.swift b/Sources/Scripta/WhisperEngine.swift index d8c694a..14831ba 100644 --- a/Sources/Scripta/WhisperEngine.swift +++ b/Sources/Scripta/WhisperEngine.swift @@ -21,6 +21,7 @@ final class WhisperEngine { private var isProcessing = false var onTranscript: ((String) -> Void)? + var language: String = "en" static let defaultModelName = "ggml-base.bin" @@ -127,7 +128,7 @@ final class WhisperEngine { params.print_special = false params.no_context = true params.single_segment = false - let langStr = strdup("en") + let langStr = strdup(self.language) params.language = UnsafePointer(langStr) params.n_threads = 4