generated from nhcarrigan/template
feat: handle transcription in real time
This commit is contained in:
@@ -387,6 +387,35 @@ async fn transcribe_chunk(
|
||||
Ok(segments)
|
||||
}
|
||||
|
||||
/// Get the next chunk of audio for real-time transcription.
|
||||
/// Returns the audio chunk and the new offset to use for the next call.
|
||||
#[tauri::command]
|
||||
async fn get_audio_chunk(
|
||||
state: State<'_, AppState>,
|
||||
last_offset: usize,
|
||||
) -> Result<(Vec<f32>, usize), String> {
|
||||
let audio_guard = state.audio_capture.lock();
|
||||
if let Some(ref capture) = *audio_guard {
|
||||
Ok(capture.extract_chunk(last_offset))
|
||||
} else {
|
||||
Err("No active recording".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Get remaining audio without modifying the buffer (for final processing).
|
||||
#[tauri::command]
|
||||
async fn get_remaining_audio(
|
||||
state: State<'_, AppState>,
|
||||
last_offset: usize,
|
||||
) -> Result<Vec<f32>, String> {
|
||||
let audio_guard = state.audio_capture.lock();
|
||||
if let Some(ref capture) = *audio_guard {
|
||||
Ok(capture.get_remaining_audio(last_offset))
|
||||
} else {
|
||||
Err("No active recording".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a summary from a transcript.
|
||||
#[tauri::command]
|
||||
async fn summarize(
|
||||
@@ -449,6 +478,8 @@ pub fn run() {
|
||||
start_recording,
|
||||
stop_recording,
|
||||
transcribe_chunk,
|
||||
get_audio_chunk,
|
||||
get_remaining_audio,
|
||||
summarize,
|
||||
get_backend_logs,
|
||||
check_ready,
|
||||
|
||||
@@ -196,6 +196,51 @@ impl AudioCapture {
|
||||
let sample_count = self.buffer.lock().len();
|
||||
sample_count as f32 / WHISPER_SAMPLE_RATE as f32
|
||||
}
|
||||
|
||||
/// Extract audio chunk for real-time processing.
|
||||
/// This method retrieves audio starting from the given offset and returns only
|
||||
/// the new samples, keeping a small overlap for context.
|
||||
pub fn extract_chunk(&self, from_sample: usize) -> (Vec<f32>, usize) {
|
||||
let mut buffer = self.buffer.lock();
|
||||
let current_len = buffer.len();
|
||||
|
||||
// If we don't have enough new samples, return empty
|
||||
if from_sample >= current_len {
|
||||
return (Vec::new(), current_len);
|
||||
}
|
||||
|
||||
// Extract new samples
|
||||
let chunk: Vec<f32> = buffer[from_sample..].to_vec();
|
||||
|
||||
// Keep only the last 30 seconds of audio (at 16kHz) to prevent memory exhaustion
|
||||
// This provides enough overlap for context while limiting memory usage
|
||||
const MAX_BUFFER_SECONDS: usize = 30;
|
||||
const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE as usize * MAX_BUFFER_SECONDS;
|
||||
|
||||
if buffer.len() > MAX_BUFFER_SAMPLES {
|
||||
// Calculate how many samples to remove from the beginning
|
||||
let samples_to_remove = buffer.len() - MAX_BUFFER_SAMPLES;
|
||||
buffer.drain(..samples_to_remove);
|
||||
|
||||
// Return the chunk and adjust the offset
|
||||
return (chunk, current_len - samples_to_remove);
|
||||
}
|
||||
|
||||
// Return the chunk and the new offset
|
||||
(chunk, current_len)
|
||||
}
|
||||
|
||||
/// Get all audio samples from the given offset without modifying the buffer.
|
||||
/// This is used when stopping recording to get any remaining audio.
|
||||
pub fn get_remaining_audio(&self, from_sample: usize) -> Vec<f32> {
|
||||
let buffer = self.buffer.lock();
|
||||
|
||||
if from_sample >= buffer.len() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
buffer[from_sample..].to_vec()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AudioCapture {
|
||||
|
||||
@@ -175,6 +175,12 @@ body {
|
||||
animation: pulse 1.5s infinite;
|
||||
}
|
||||
|
||||
/* Subtle italic caption shown next to the recording timer while
   live transcription is producing segments. */
.real-time-indicator {
  font-style: italic;
  font-size: 0.875rem;
  color: var(--secondary-color);
}
|
||||
|
||||
.recording-duration {
|
||||
font-weight: 600;
|
||||
font-variant-numeric: tabular-nums;
|
||||
|
||||
+84
-5
@@ -25,6 +25,9 @@ function App() {
|
||||
const [recordingDuration, setRecordingDuration] = useState(0);
|
||||
const initStarted = useRef(false);
|
||||
const recordingTimer = useRef<number | null>(null);
|
||||
const transcriptionTimer = useRef<number | null>(null);
|
||||
const audioOffset = useRef(0);
|
||||
const totalProcessedSamples = useRef(0);
|
||||
|
||||
useEffect(() => {
|
||||
if (initStarted.current) return;
|
||||
@@ -33,12 +36,15 @@ function App() {
|
||||
initializeApp();
|
||||
}, []);
|
||||
|
||||
// Cleanup timer on unmount
|
||||
// Cleanup timers on unmount
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
if (recordingTimer.current) {
|
||||
clearInterval(recordingTimer.current);
|
||||
}
|
||||
if (transcriptionTimer.current) {
|
||||
clearInterval(transcriptionTimer.current);
|
||||
}
|
||||
};
|
||||
}, []);
|
||||
|
||||
@@ -93,12 +99,51 @@ function App() {
|
||||
initializeApp();
|
||||
};
|
||||
|
||||
const processAudioChunk = async () => {
|
||||
try {
|
||||
// Get the next chunk of audio
|
||||
const [audioChunk, newOffset] = await invoke<[number[], number]>("get_audio_chunk", {
|
||||
lastOffset: audioOffset.current
|
||||
});
|
||||
|
||||
// If we have enough audio (at least 5 seconds worth at 16kHz)
|
||||
if (audioChunk.length >= 5 * 16000) {
|
||||
// Transcribe the chunk
|
||||
const newSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
||||
audioData: audioChunk
|
||||
});
|
||||
|
||||
if (newSegments.length > 0) {
|
||||
// Calculate timestamps based on total processed samples
|
||||
const baseTime = totalProcessedSamples.current / 16000;
|
||||
const adjustedSegments = newSegments.map(seg => ({
|
||||
...seg,
|
||||
start: seg.start + baseTime,
|
||||
end: seg.end + baseTime,
|
||||
}));
|
||||
|
||||
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
|
||||
}
|
||||
|
||||
// Track total processed samples
|
||||
totalProcessedSamples.current += audioChunk.length;
|
||||
|
||||
// Update the offset for next time
|
||||
audioOffset.current = newOffset;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Failed to process audio chunk:", error);
|
||||
}
|
||||
};
|
||||
|
||||
const startRecording = async () => {
|
||||
try {
|
||||
setAppState("recording");
|
||||
setRecordingDuration(0);
|
||||
setTranscriptSegments([]);
|
||||
setSummary(null);
|
||||
audioOffset.current = 0;
|
||||
totalProcessedSamples.current = 0;
|
||||
|
||||
await invoke("start_recording");
|
||||
|
||||
@@ -106,6 +151,11 @@ function App() {
|
||||
recordingTimer.current = window.setInterval(() => {
|
||||
setRecordingDuration(d => d + 1);
|
||||
}, 1000);
|
||||
|
||||
// Start real-time transcription timer (every 5 seconds)
|
||||
transcriptionTimer.current = window.setInterval(() => {
|
||||
processAudioChunk();
|
||||
}, 5000);
|
||||
} catch (error) {
|
||||
console.error("Failed to start recording:", error);
|
||||
setAppState("ready");
|
||||
@@ -115,17 +165,43 @@ function App() {
|
||||
|
||||
const stopRecording = async () => {
|
||||
try {
|
||||
// Stop the timer
|
||||
// Stop the timers
|
||||
if (recordingTimer.current) {
|
||||
clearInterval(recordingTimer.current);
|
||||
recordingTimer.current = null;
|
||||
}
|
||||
if (transcriptionTimer.current) {
|
||||
clearInterval(transcriptionTimer.current);
|
||||
transcriptionTimer.current = null;
|
||||
}
|
||||
|
||||
setAppState("transcribing");
|
||||
setStatusMessage("Transcribing audio...");
|
||||
setStatusMessage("Processing final audio...");
|
||||
|
||||
const segments = await invoke<TranscriptSegment[]>("stop_recording");
|
||||
setTranscriptSegments(segments);
|
||||
// Process any remaining audio
|
||||
const finalChunk = await invoke<number[]>("get_remaining_audio", {
|
||||
lastOffset: audioOffset.current
|
||||
});
|
||||
|
||||
if (finalChunk.length > 0) {
|
||||
const finalSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
||||
audioData: finalChunk
|
||||
});
|
||||
|
||||
if (finalSegments.length > 0) {
|
||||
const baseTime = totalProcessedSamples.current / 16000;
|
||||
const adjustedSegments = finalSegments.map(seg => ({
|
||||
...seg,
|
||||
start: seg.start + baseTime,
|
||||
end: seg.end + baseTime,
|
||||
}));
|
||||
|
||||
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
|
||||
}
|
||||
}
|
||||
|
||||
// Stop the recording
|
||||
await invoke("stop_recording");
|
||||
|
||||
setAppState("ready");
|
||||
setStatusMessage("");
|
||||
@@ -264,6 +340,9 @@ function App() {
|
||||
<div className="recording-indicator">
|
||||
<span className="recording-dot" />
|
||||
Recording: {formatDuration(recordingDuration)}
|
||||
{transcriptSegments.length > 0 && (
|
||||
<span className="real-time-indicator"> (Real-time transcription active)</span>
|
||||
)}
|
||||
</div>
|
||||
<button className="stop-button" onClick={stopRecording}>
|
||||
⏹️ Stop Recording
|
||||
|
||||
Reference in New Issue
Block a user