From 0c22e3df7c4d5965c8dcb79c5296c609696fb263 Mon Sep 17 00:00:00 2001
From: Niklas Buse <nicki_bu@gmx.de>
Date: Mon, 15 Jun 2026 18:32:16 +0200
Subject: [PATCH] fix: audio export quality and Whisper language selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- SystemAudioCapture: raise sample rate from 16 kHz to 48 kHz so
  exported audio can carry frequencies up to 24 kHz (the Nyquist limit
  of 48 kHz) instead of being cut off at 8 kHz (Nyquist of 16 kHz)

- MeetingRecorder: update audio file settings to 48 kHz / 128 kbps AAC;
  write original 48 kHz PCM to the audio file in handleSystemAudioBuffer
  instead of the already-downsampled 16 kHz buffer that was fed to
  SFSpeech; fix writeMicAudio memcpy fast-path to also trigger for
  stereo hardware input (was gated on channelCount == 1 unnecessarily)

- WhisperEngine: add `language` property (default "en"), use it in
  transcribeChunk instead of a hardcoded language string; set it from
  MeetingRecorder.startRecording() via the lowercased language subtag
  of recognitionLanguage, i.e. the part before the first "-"
  (e.g. "de-DE" → "de")

Fixes #2, fixes #3
---
 Sources/Scripta/MeetingRecorder.swift    | 13 ++++++++-----
 Sources/Scripta/SystemAudioCapture.swift |  2 +-
 Sources/Scripta/WhisperEngine.swift      |  3 ++-
 3 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/Sources/Scripta/MeetingRecorder.swift b/Sources/Scripta/MeetingRecorder.swift
index 3856a3b..4da2e99 100644
--- a/Sources/Scripta/MeetingRecorder.swift
+++ b/Sources/Scripta/MeetingRecorder.swift
@@ -118,6 +118,7 @@ final class MeetingRecorder: NSObject, ObservableObject {
             }
         }
 
+        whisperEngine.language = recognitionLanguage.components(separatedBy: "-").first?.lowercased() ?? "en"
         whisperEngine.onTranscript = { [weak self] text in
             guard let self, self.state == .recording, !self.micMuted else { return }
             self.appendWhisperTranscript(text)
@@ -309,6 +310,10 @@ final class MeetingRecorder: NSObject, ObservableObject {
             mplog("System audio: buffer #\(systemBufferCount) frames=\(pcm.frameLength) rate=\(pcm.format.sampleRate) ch=\(pcm.format.channelCount) fmt=\(pcm.format.commonFormat.rawValue)")
         }
 
+        // Write original quality (48 kHz) to the audio file.
+        writeSystemAudio(pcm)
+
+        // Downsample to 16 kHz only for SFSpeech recognition.
         let buffer: AVAudioPCMBuffer
         if pcm.format.sampleRate == recognitionFormat.sampleRate &&
             pcm.format.channelCount == recognitionFormat.channelCount &&
@@ -319,8 +324,6 @@ final class MeetingRecorder: NSObject, ObservableObject {
             buffer = converted
         }
 
-        writeSystemAudio(buffer)
-
         if !systemRecognitionStarted && state == .recording {
             systemRecognitionStarted = true
             DispatchQueue.main.async { [weak self] in
@@ -676,9 +679,9 @@ final class MeetingRecorder: NSObject, ObservableObject {
 
     private static let audioFileSettings: [String: Any] = [
         AVFormatIDKey: kAudioFormatMPEG4AAC,
-        AVSampleRateKey: 16_000,
+        AVSampleRateKey: 48_000,
         AVNumberOfChannelsKey: 1,
-        AVEncoderBitRateKey: 48_000,
+        AVEncoderBitRateKey: 128_000,
     ]
 
     private func startAudioWriters() {
@@ -728,7 +731,7 @@ final class MeetingRecorder: NSObject, ObservableObject {
         output.frameLength = outFrames
 
         guard let outPtr = output.floatChannelData?[0] else { return }
-        if abs(ratio - 1.0) < 0.001 && buffer.format.channelCount == 1 {
+        if abs(ratio - 1.0) < 0.001 {
             memcpy(outPtr, ch0, Int(frames) * MemoryLayout<Float>.size)
         } else {
             let srcCount = Int(frames)
diff --git a/Sources/Scripta/SystemAudioCapture.swift b/Sources/Scripta/SystemAudioCapture.swift
index f4f71a5..429e158 100644
--- a/Sources/Scripta/SystemAudioCapture.swift
+++ b/Sources/Scripta/SystemAudioCapture.swift
@@ -61,7 +61,7 @@ final class SystemAudioCapture: NSObject, SCStreamDelegate, SCStreamOutput {
         let config = SCStreamConfiguration()
         config.capturesAudio = true
         config.excludesCurrentProcessAudio = true
-        config.sampleRate = 16_000
+        config.sampleRate = 48_000
         config.channelCount = 1
         config.queueDepth = 8
         config.width = 2
diff --git a/Sources/Scripta/WhisperEngine.swift b/Sources/Scripta/WhisperEngine.swift
index d8c694a..14831ba 100644
--- a/Sources/Scripta/WhisperEngine.swift
+++ b/Sources/Scripta/WhisperEngine.swift
@@ -21,6 +21,7 @@ final class WhisperEngine {
     private var isProcessing = false
 
     var onTranscript: ((String) -> Void)?
+    var language: String = "en"
 
     static let defaultModelName = "ggml-base.bin"
 
@@ -127,7 +128,7 @@ final class WhisperEngine {
             params.print_special = false
             params.no_context = true
             params.single_segment = false
-            let langStr = strdup("en")
+            let langStr = strdup(self.language)
             params.language = UnsafePointer(langStr)
             params.n_threads = 4