From e6c19b589e54af4204da325cd3794b41bcc084cc Mon Sep 17 00:00:00 2001 From: Naomi Carrigan Date: Thu, 29 Jan 2026 10:31:40 -0800 Subject: [PATCH] feat: handle transcription in real time --- src-tauri/src/lib.rs | 31 ++++++++++++++ src-tauri/src/ml/audio.rs | 45 ++++++++++++++++++++ src/App.css | 6 +++ src/App.tsx | 89 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 166 insertions(+), 5 deletions(-) diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 1f15b58..3877194 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -387,6 +387,35 @@ async fn transcribe_chunk( Ok(segments) } +/// Get the next chunk of audio for real-time transcription. +/// Returns the audio chunk and the new offset to use for the next call. +#[tauri::command] +async fn get_audio_chunk( + state: State<'_, AppState>, + last_offset: usize, +) -> Result<(Vec<f32>, usize), String> { + let audio_guard = state.audio_capture.lock(); + if let Some(ref capture) = *audio_guard { + Ok(capture.extract_chunk(last_offset)) + } else { + Err("No active recording".to_string()) + } +} + +/// Get remaining audio without modifying the buffer (for final processing). +#[tauri::command] +async fn get_remaining_audio( + state: State<'_, AppState>, + last_offset: usize, +) -> Result<Vec<f32>, String> { + let audio_guard = state.audio_capture.lock(); + if let Some(ref capture) = *audio_guard { + Ok(capture.get_remaining_audio(last_offset)) + } else { + Err("No active recording".to_string()) + } +} + +/// Generate a summary from a transcript. 
#[tauri::command] async fn summarize( @@ -449,6 +478,8 @@ pub fn run() { start_recording, stop_recording, transcribe_chunk, + get_audio_chunk, + get_remaining_audio, summarize, get_backend_logs, check_ready, diff --git a/src-tauri/src/ml/audio.rs b/src-tauri/src/ml/audio.rs index 67ea1ae..8eaa5b5 100644 --- a/src-tauri/src/ml/audio.rs +++ b/src-tauri/src/ml/audio.rs @@ -196,6 +196,51 @@ impl AudioCapture { let sample_count = self.buffer.lock().len(); sample_count as f32 / WHISPER_SAMPLE_RATE as f32 } + + /// Extract audio chunk for real-time processing. + /// This method retrieves audio starting from the given offset and returns only + /// the new samples, keeping a small overlap for context. + pub fn extract_chunk(&self, from_sample: usize) -> (Vec<f32>, usize) { + let mut buffer = self.buffer.lock(); + let current_len = buffer.len(); + + // If we don't have enough new samples, return empty + if from_sample >= current_len { + return (Vec::new(), current_len); + } + + // Extract new samples + let chunk: Vec<f32> = buffer[from_sample..].to_vec(); + + // Keep only the last 30 seconds of audio (at 16kHz) to prevent memory exhaustion + // This provides enough overlap for context while limiting memory usage + const MAX_BUFFER_SECONDS: usize = 30; + const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE as usize * MAX_BUFFER_SECONDS; + + if buffer.len() > MAX_BUFFER_SAMPLES { + // Calculate how many samples to remove from the beginning + let samples_to_remove = buffer.len() - MAX_BUFFER_SAMPLES; + buffer.drain(..samples_to_remove); + + // Return the chunk and adjust the offset + return (chunk, current_len - samples_to_remove); + } + + // Return the chunk and the new offset + (chunk, current_len) + } + + /// Get all audio samples from the given offset without modifying the buffer. + /// This is used when stopping recording to get any remaining audio. 
+ pub fn get_remaining_audio(&self, from_sample: usize) -> Vec<f32> { + let buffer = self.buffer.lock(); + + if from_sample >= buffer.len() { + return Vec::new(); + } + + buffer[from_sample..].to_vec() + } } impl Default for AudioCapture { diff --git a/src/App.css b/src/App.css index cf5c64c..cecb7c9 100644 --- a/src/App.css +++ b/src/App.css @@ -175,6 +175,12 @@ body { animation: pulse 1.5s infinite; } +.real-time-indicator { + font-size: 0.875rem; + color: var(--secondary-color); + font-style: italic; +} + .recording-duration { font-weight: 600; font-variant-numeric: tabular-nums; diff --git a/src/App.tsx b/src/App.tsx index 9a6c365..7c53bd9 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -25,6 +25,9 @@ function App() { const [recordingDuration, setRecordingDuration] = useState(0); const initStarted = useRef(false); const recordingTimer = useRef(null); + const transcriptionTimer = useRef(null); + const audioOffset = useRef(0); + const totalProcessedSamples = useRef(0); useEffect(() => { if (initStarted.current) return; @@ -33,12 +36,15 @@ function App() { initializeApp(); }, []); - // Cleanup timer on unmount + // Cleanup timers on unmount useEffect(() => { return () => { if (recordingTimer.current) { clearInterval(recordingTimer.current); } + if (transcriptionTimer.current) { + clearInterval(transcriptionTimer.current); + } }; }, []); @@ -93,12 +99,51 @@ function App() { initializeApp(); }; + const processAudioChunk = async () => { + try { + // Get the next chunk of audio + const [audioChunk, newOffset] = await invoke<[number[], number]>("get_audio_chunk", { + lastOffset: audioOffset.current + }); + + // If we have enough audio (at least 5 seconds worth at 16kHz) + if (audioChunk.length >= 5 * 16000) { + // Transcribe the chunk + const newSegments = await invoke("transcribe_chunk", { + audioData: audioChunk + }); + + if (newSegments.length > 0) { + // Calculate timestamps based on total processed samples + const baseTime = totalProcessedSamples.current / 16000; + const 
adjustedSegments = newSegments.map(seg => ({ + ...seg, + start: seg.start + baseTime, + end: seg.end + baseTime, + })); + + setTranscriptSegments(prev => [...prev, ...adjustedSegments]); + } + + // Track total processed samples + totalProcessedSamples.current += audioChunk.length; + + // Update the offset for next time + audioOffset.current = newOffset; + } + } catch (error) { + console.error("Failed to process audio chunk:", error); + } + }; + const startRecording = async () => { try { setAppState("recording"); setRecordingDuration(0); setTranscriptSegments([]); setSummary(null); + audioOffset.current = 0; + totalProcessedSamples.current = 0; await invoke("start_recording"); @@ -106,6 +151,11 @@ function App() { recordingTimer.current = window.setInterval(() => { setRecordingDuration(d => d + 1); }, 1000); + + // Start real-time transcription timer (every 5 seconds) + transcriptionTimer.current = window.setInterval(() => { + processAudioChunk(); + }, 5000); } catch (error) { console.error("Failed to start recording:", error); setAppState("ready"); @@ -115,17 +165,43 @@ function App() { const stopRecording = async () => { try { - // Stop the timer + // Stop the timers if (recordingTimer.current) { clearInterval(recordingTimer.current); recordingTimer.current = null; } + if (transcriptionTimer.current) { + clearInterval(transcriptionTimer.current); + transcriptionTimer.current = null; + } setAppState("transcribing"); - setStatusMessage("Transcribing audio..."); + setStatusMessage("Processing final audio..."); - const segments = await invoke("stop_recording"); - setTranscriptSegments(segments); + // Process any remaining audio + const finalChunk = await invoke("get_remaining_audio", { + lastOffset: audioOffset.current + }); + + if (finalChunk.length > 0) { + const finalSegments = await invoke("transcribe_chunk", { + audioData: finalChunk + }); + + if (finalSegments.length > 0) { + const baseTime = totalProcessedSamples.current / 16000; + const adjustedSegments = 
finalSegments.map(seg => ({ + ...seg, + start: seg.start + baseTime, + end: seg.end + baseTime, + })); + + setTranscriptSegments(prev => [...prev, ...adjustedSegments]); + } + } + + // Stop the recording + await invoke("stop_recording"); setAppState("ready"); setStatusMessage(""); @@ -264,6 +340,9 @@ function App() {
Recording: {formatDuration(recordingDuration)} + {transcriptSegments.length > 0 && ( + <span className="real-time-indicator">(Real-time transcription active)</span> + )}