feat: handle transcription in real time

2026-01-29 10:31:40 -08:00
parent df8a89e05d
commit e6c19b589e
4 changed files with 166 additions and 5 deletions
+31
@@ -387,6 +387,35 @@ async fn transcribe_chunk(
Ok(segments)
}
/// Get the next chunk of audio for real-time transcription.
/// Returns the audio chunk and the new offset to use for the next call.
#[tauri::command]
async fn get_audio_chunk(
state: State<'_, AppState>,
last_offset: usize,
) -> Result<(Vec<f32>, usize), String> {
let audio_guard = state.audio_capture.lock();
if let Some(ref capture) = *audio_guard {
Ok(capture.extract_chunk(last_offset))
} else {
Err("No active recording".to_string())
}
}
/// Get remaining audio without modifying the buffer (for final processing).
#[tauri::command]
async fn get_remaining_audio(
state: State<'_, AppState>,
last_offset: usize,
) -> Result<Vec<f32>, String> {
let audio_guard = state.audio_capture.lock();
if let Some(ref capture) = *audio_guard {
Ok(capture.get_remaining_audio(last_offset))
} else {
Err("No active recording".to_string())
}
}
/// Generate a summary from a transcript.
#[tauri::command]
async fn summarize(
@@ -449,6 +478,8 @@ pub fn run() {
start_recording,
stop_recording,
transcribe_chunk,
get_audio_chunk,
get_remaining_audio,
summarize,
get_backend_logs,
check_ready,
+45
@@ -196,6 +196,51 @@ impl AudioCapture {
let sample_count = self.buffer.lock().len();
sample_count as f32 / WHISPER_SAMPLE_RATE as f32
}
/// Extract audio chunk for real-time processing.
/// Returns the samples recorded since `from_sample` together with the new
/// offset to use for the next call; the offset is rebased whenever old
/// samples are trimmed from the front of the buffer.
pub fn extract_chunk(&self, from_sample: usize) -> (Vec<f32>, usize) {
let mut buffer = self.buffer.lock();
let current_len = buffer.len();
// No new samples since the last call; return empty
if from_sample >= current_len {
return (Vec::new(), current_len);
}
// Extract new samples
let chunk: Vec<f32> = buffer[from_sample..].to_vec();
// Keep only the last 30 seconds of audio (at 16kHz) to prevent memory exhaustion
// This provides enough overlap for context while limiting memory usage
const MAX_BUFFER_SECONDS: usize = 30;
const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE as usize * MAX_BUFFER_SECONDS;
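// 16,000 samples/s × 30 s = 480,000 samples (≈1.8 MiB of f32).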
if buffer.len() > MAX_BUFFER_SAMPLES {
// Calculate how many samples to remove from the beginning
let samples_to_remove = buffer.len() - MAX_BUFFER_SAMPLES;
buffer.drain(..samples_to_remove);
// Return the chunk and adjust the offset
return (chunk, current_len - samples_to_remove);
}
// Return the chunk and the new offset
(chunk, current_len)
}
/// Get all audio samples from the given offset without modifying the buffer.
/// This is used when stopping recording to get any remaining audio.
pub fn get_remaining_audio(&self, from_sample: usize) -> Vec<f32> {
let buffer = self.buffer.lock();
if from_sample >= buffer.len() {
return Vec::new();
}
buffer[from_sample..].to_vec()
}
}
impl Default for AudioCapture {
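To make the offset contract concrete, here is a standalone sketch (not part of the commit) that mirrors extract_chunk's arithmetic on a plain Vec<f32>. WHISPER_SAMPLE_RATE and the 30-second cap are restated locally so the snippet runs on its own; extract_chunk_sketch and the main driver are illustrative names, not code from this repository.

const WHISPER_SAMPLE_RATE: usize = 16_000;
const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE * 30;

/// Mirrors `extract_chunk`: return the samples after `from_sample` and the
/// rebased offset to use next time. Trimming shifts every absolute position
/// left by `removed`, which is why the returned offset must be adopted.
fn extract_chunk_sketch(buffer: &mut Vec<f32>, from_sample: usize) -> (Vec<f32>, usize) {
    let current_len = buffer.len();
    if from_sample >= current_len {
        return (Vec::new(), current_len);
    }
    let chunk = buffer[from_sample..].to_vec();
    if current_len > MAX_BUFFER_SAMPLES {
        let removed = current_len - MAX_BUFFER_SAMPLES;
        buffer.drain(..removed);
        return (chunk, current_len - removed);
    }
    (chunk, current_len)
}

fn main() {
    // 35 s of silence; the caller has already consumed the first 30 s.
    let mut buffer = vec![0.0_f32; WHISPER_SAMPLE_RATE * 35];
    let (chunk, next_offset) = extract_chunk_sketch(&mut buffer, WHISPER_SAMPLE_RATE * 30);
    assert_eq!(chunk.len(), WHISPER_SAMPLE_RATE * 5);  // only the new 5 s come back
    assert_eq!(buffer.len(), MAX_BUFFER_SAMPLES);      // trimmed to the 30 s cap
    assert_eq!(next_offset, WHISPER_SAMPLE_RATE * 30); // rebased: 480,000, not 560,000
}

One consequence worth noting: processAudioChunk below only adopts newOffset when a chunk reaches the 5-second threshold, so a shorter read that coincides with a trim would leave the stored offset stale.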
+6
@@ -175,6 +175,12 @@ body {
animation: pulse 1.5s infinite;
}
.real-time-indicator {
font-size: 0.875rem;
color: var(--secondary-color);
font-style: italic;
}
.recording-duration {
font-weight: 600;
font-variant-numeric: tabular-nums;
+84 -5
@@ -25,6 +25,9 @@ function App() {
const [recordingDuration, setRecordingDuration] = useState(0);
const initStarted = useRef(false);
const recordingTimer = useRef<number | null>(null);
const transcriptionTimer = useRef<number | null>(null);
const audioOffset = useRef(0);
const totalProcessedSamples = useRef(0);
useEffect(() => {
if (initStarted.current) return;
@@ -33,12 +36,15 @@ function App() {
initializeApp();
}, []);
- // Cleanup timer on unmount
+ // Cleanup timers on unmount
useEffect(() => {
return () => {
if (recordingTimer.current) {
clearInterval(recordingTimer.current);
}
if (transcriptionTimer.current) {
clearInterval(transcriptionTimer.current);
}
};
}, []);
@@ -93,12 +99,51 @@ function App() {
initializeApp();
};
const processAudioChunk = async () => {
try {
// Get the next chunk of audio
const [audioChunk, newOffset] = await invoke<[number[], number]>("get_audio_chunk", {
lastOffset: audioOffset.current
});
// If we have enough audio (at least 5 seconds worth at 16kHz)
if (audioChunk.length >= 5 * 16000) {
// Transcribe the chunk
const newSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
audioData: audioChunk
});
if (newSegments.length > 0) {
// Calculate timestamps based on total processed samples
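// (each chunk is transcribed on its own, so its timestamps restart at 0)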
const baseTime = totalProcessedSamples.current / 16000;
const adjustedSegments = newSegments.map(seg => ({
...seg,
start: seg.start + baseTime,
end: seg.end + baseTime,
}));
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
}
// Track total processed samples
totalProcessedSamples.current += audioChunk.length;
// Update the offset for next time
audioOffset.current = newOffset;
}
} catch (error) {
console.error("Failed to process audio chunk:", error);
}
};
const startRecording = async () => {
try {
setAppState("recording");
setRecordingDuration(0);
setTranscriptSegments([]);
setSummary(null);
audioOffset.current = 0;
totalProcessedSamples.current = 0;
await invoke("start_recording");
@@ -106,6 +151,11 @@ function App() {
recordingTimer.current = window.setInterval(() => {
setRecordingDuration(d => d + 1);
}, 1000);
// Start real-time transcription timer (every 5 seconds)
transcriptionTimer.current = window.setInterval(() => {
processAudioChunk();
}, 5000);
} catch (error) {
console.error("Failed to start recording:", error);
setAppState("ready");
@@ -115,17 +165,43 @@ function App() {
const stopRecording = async () => {
try {
- // Stop the timer
+ // Stop the timers
if (recordingTimer.current) {
clearInterval(recordingTimer.current);
recordingTimer.current = null;
}
if (transcriptionTimer.current) {
clearInterval(transcriptionTimer.current);
transcriptionTimer.current = null;
}
setAppState("transcribing"); setAppState("transcribing");
setStatusMessage("Transcribing audio..."); setStatusMessage("Processing final audio...");
const segments = await invoke<TranscriptSegment[]>("stop_recording"); // Process any remaining audio
setTranscriptSegments(segments); const finalChunk = await invoke<number[]>("get_remaining_audio", {
lastOffset: audioOffset.current
});
if (finalChunk.length > 0) {
const finalSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
audioData: finalChunk
});
if (finalSegments.length > 0) {
const baseTime = totalProcessedSamples.current / 16000;
const adjustedSegments = finalSegments.map(seg => ({
...seg,
start: seg.start + baseTime,
end: seg.end + baseTime,
}));
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
}
}
// Stop the recording
await invoke("stop_recording");
setAppState("ready"); setAppState("ready");
setStatusMessage(""); setStatusMessage("");
@@ -264,6 +340,9 @@ function App() {
<div className="recording-indicator"> <div className="recording-indicator">
<span className="recording-dot" /> <span className="recording-dot" />
Recording: {formatDuration(recordingDuration)} Recording: {formatDuration(recordingDuration)}
{transcriptSegments.length > 0 && (
<span className="real-time-indicator"> (Real-time transcription active)</span>
)}
</div>
<button className="stop-button" onClick={stopRecording}>
Stop Recording