From e6c19b589e54af4204da325cd3794b41bcc084cc Mon Sep 17 00:00:00 2001 From: Naomi Carrigan Date: Thu, 29 Jan 2026 10:31:40 -0800 Subject: [PATCH] feat: handle transcription in real time --- src-tauri/src/lib.rs | 31 ++++++++++++++ src-tauri/src/ml/audio.rs | 45 ++++++++++++++++++++ src/App.css | 6 +++ src/App.tsx | 89 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 166 insertions(+), 5 deletions(-) diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 1f15b58..3877194 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -387,6 +387,35 @@ async fn transcribe_chunk( Ok(segments) } +/// Get the next chunk of audio for real-time transcription. +/// Returns the audio chunk and the new offset to use for the next call. +#[tauri::command] +async fn get_audio_chunk( + state: State<'_, AppState>, + last_offset: usize, +) -> Result<(Vec<f32>, usize), String> { + let audio_guard = state.audio_capture.lock(); + if let Some(ref capture) = *audio_guard { + Ok(capture.extract_chunk(last_offset)) + } else { + Err("No active recording".to_string()) + } +} + +/// Get remaining audio without modifying the buffer (for final processing). +#[tauri::command] +async fn get_remaining_audio( + state: State<'_, AppState>, + last_offset: usize, +) -> Result<Vec<f32>, String> { + let audio_guard = state.audio_capture.lock(); + if let Some(ref capture) = *audio_guard { + Ok(capture.get_remaining_audio(last_offset)) + } else { + Err("No active recording".to_string()) + } +} + +/// Generate a summary from a transcript. 
#[tauri::command] async fn summarize( @@ -449,6 +478,8 @@ pub fn run() { start_recording, stop_recording, transcribe_chunk, + get_audio_chunk, + get_remaining_audio, summarize, get_backend_logs, check_ready, diff --git a/src-tauri/src/ml/audio.rs b/src-tauri/src/ml/audio.rs index 67ea1ae..8eaa5b5 100644 --- a/src-tauri/src/ml/audio.rs +++ b/src-tauri/src/ml/audio.rs @@ -196,6 +196,51 @@ impl AudioCapture { let sample_count = self.buffer.lock().len(); sample_count as f32 / WHISPER_SAMPLE_RATE as f32 } + + /// Extract audio chunk for real-time processing. + /// This method retrieves audio starting from the given offset and returns only + /// the new samples, keeping a small overlap for context. + pub fn extract_chunk(&self, from_sample: usize) -> (Vec<f32>, usize) { + let mut buffer = self.buffer.lock(); + let current_len = buffer.len(); + + // If we don't have enough new samples, return empty + if from_sample >= current_len { + return (Vec::new(), current_len); + } + + // Extract new samples + let chunk: Vec<f32> = buffer[from_sample..].to_vec(); + + // Keep only the last 30 seconds of audio (at 16kHz) to prevent memory exhaustion + // This provides enough overlap for context while limiting memory usage + const MAX_BUFFER_SECONDS: usize = 30; + const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE as usize * MAX_BUFFER_SECONDS; + + if buffer.len() > MAX_BUFFER_SAMPLES { + // Calculate how many samples to remove from the beginning + let samples_to_remove = buffer.len() - MAX_BUFFER_SAMPLES; + buffer.drain(..samples_to_remove); + + // Return the chunk and adjust the offset + return (chunk, current_len - samples_to_remove); + } + + // Return the chunk and the new offset + (chunk, current_len) + } + + /// Get all audio samples from the given offset without modifying the buffer. + /// This is used when stopping recording to get any remaining audio. 
+ pub fn get_remaining_audio(&self, from_sample: usize) -> Vec<f32> { + let buffer = self.buffer.lock(); + + if from_sample >= buffer.len() { + return Vec::new(); + } + + buffer[from_sample..].to_vec() + } } impl Default for AudioCapture { diff --git a/src/App.css b/src/App.css index cf5c64c..cecb7c9 100644 --- a/src/App.css +++ b/src/App.css @@ -175,6 +175,12 @@ body { animation: pulse 1.5s infinite; } +.real-time-indicator { + font-size: 0.875rem; + color: var(--secondary-color); + font-style: italic; +} + .recording-duration { font-weight: 600; font-variant-numeric: tabular-nums; diff --git a/src/App.tsx b/src/App.tsx index 9a6c365..7c53bd9 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -25,6 +25,9 @@ function App() { const [recordingDuration, setRecordingDuration] = useState(0); const initStarted = useRef(false); const recordingTimer = useRef(null); + const transcriptionTimer = useRef(null); + const audioOffset = useRef(0); + const totalProcessedSamples = useRef(0); useEffect(() => { if (initStarted.current) return; @@ -33,12 +36,15 @@ function App() { initializeApp(); }, []); - // Cleanup timer on unmount + // Cleanup timers on unmount useEffect(() => { return () => { if (recordingTimer.current) { clearInterval(recordingTimer.current); } + if (transcriptionTimer.current) { + clearInterval(transcriptionTimer.current); + } }; }, []); @@ -93,12 +99,51 @@ function App() { initializeApp(); }; + const processAudioChunk = async () => { + try { + // Get the next chunk of audio + const [audioChunk, newOffset] = await invoke<[number[], number]>("get_audio_chunk", { + lastOffset: audioOffset.current + }); + + // If we have enough audio (at least 5 seconds worth at 16kHz) + if (audioChunk.length >= 5 * 16000) { + // Transcribe the chunk + const newSegments = await invoke("transcribe_chunk", { + audioData: audioChunk + }); + + if (newSegments.length > 0) { + // Calculate timestamps based on total processed samples + const baseTime = totalProcessedSamples.current / 16000; + const 
adjustedSegments = newSegments.map(seg => ({ + ...seg, + start: seg.start + baseTime, + end: seg.end + baseTime, + })); + + setTranscriptSegments(prev => [...prev, ...adjustedSegments]); + } + + // Track total processed samples + totalProcessedSamples.current += audioChunk.length; + + // Update the offset for next time + audioOffset.current = newOffset; + } + } catch (error) { + console.error("Failed to process audio chunk:", error); + } + }; + const startRecording = async () => { try { setAppState("recording"); setRecordingDuration(0); setTranscriptSegments([]); setSummary(null); + audioOffset.current = 0; + totalProcessedSamples.current = 0; await invoke("start_recording"); @@ -106,6 +151,11 @@ function App() { recordingTimer.current = window.setInterval(() => { setRecordingDuration(d => d + 1); }, 1000); + + // Start real-time transcription timer (every 5 seconds) + transcriptionTimer.current = window.setInterval(() => { + processAudioChunk(); + }, 5000); } catch (error) { console.error("Failed to start recording:", error); setAppState("ready"); @@ -115,17 +165,43 @@ function App() { const stopRecording = async () => { try { - // Stop the timer + // Stop the timers if (recordingTimer.current) { clearInterval(recordingTimer.current); recordingTimer.current = null; } + if (transcriptionTimer.current) { + clearInterval(transcriptionTimer.current); + transcriptionTimer.current = null; + } setAppState("transcribing"); - setStatusMessage("Transcribing audio..."); + setStatusMessage("Processing final audio..."); - const segments = await invoke("stop_recording"); - setTranscriptSegments(segments); + // Process any remaining audio + const finalChunk = await invoke("get_remaining_audio", { + lastOffset: audioOffset.current + }); + + if (finalChunk.length > 0) { + const finalSegments = await invoke("transcribe_chunk", { + audioData: finalChunk + }); + + if (finalSegments.length > 0) { + const baseTime = totalProcessedSamples.current / 16000; + const adjustedSegments = 
finalSegments.map(seg => ({ + ...seg, + start: seg.start + baseTime, + end: seg.end + baseTime, + })); + + setTranscriptSegments(prev => [...prev, ...adjustedSegments]); + } + } + + // Stop the recording + await invoke("stop_recording"); setAppState("ready"); setStatusMessage(""); @@ -264,6 +340,9 @@ function App() {
Recording: {formatDuration(recordingDuration)} + {transcriptSegments.length > 0 && ( + <span className="real-time-indicator">(Real-time transcription active)</span> + )}