feat: handle transcription in real time

2026-01-29 10:31:40 -08:00
parent df8a89e05d
commit e6c19b589e
4 changed files with 166 additions and 5 deletions
+31
@@ -387,6 +387,35 @@ async fn transcribe_chunk(
Ok(segments)
}
/// Get the next chunk of audio for real-time transcription.
/// Returns the audio chunk and the new offset to use for the next call.
#[tauri::command]
async fn get_audio_chunk(
state: State<'_, AppState>,
last_offset: usize,
) -> Result<(Vec<f32>, usize), String> {
let audio_guard = state.audio_capture.lock();
if let Some(ref capture) = *audio_guard {
Ok(capture.extract_chunk(last_offset))
} else {
Err("No active recording".to_string())
}
}
/// Get remaining audio without modifying the buffer (for final processing).
#[tauri::command]
async fn get_remaining_audio(
state: State<'_, AppState>,
last_offset: usize,
) -> Result<Vec<f32>, String> {
let audio_guard = state.audio_capture.lock();
if let Some(ref capture) = *audio_guard {
Ok(capture.get_remaining_audio(last_offset))
} else {
Err("No active recording".to_string())
}
}
/// Generate a summary from a transcript.
#[tauri::command]
async fn summarize(
@@ -449,6 +478,8 @@ pub fn run() {
start_recording,
stop_recording,
transcribe_chunk,
get_audio_chunk,
get_remaining_audio,
summarize,
get_backend_logs,
check_ready,
+45
@@ -196,6 +196,51 @@ impl AudioCapture {
let sample_count = self.buffer.lock().len();
sample_count as f32 / WHISPER_SAMPLE_RATE as f32
}
/// Extract audio chunk for real-time processing.
/// Returns the samples recorded since `from_sample` together with the new
/// offset to use for the next call; the offset is rebased whenever old
/// samples are trimmed from the front of the buffer.
pub fn extract_chunk(&self, from_sample: usize) -> (Vec<f32>, usize) {
let mut buffer = self.buffer.lock();
let current_len = buffer.len();
// No new samples since the last call; return empty
if from_sample >= current_len {
return (Vec::new(), current_len);
}
// Extract new samples
let chunk: Vec<f32> = buffer[from_sample..].to_vec();
// Keep only the last 30 seconds of audio (at 16kHz) to prevent memory exhaustion
// This provides enough overlap for context while limiting memory usage
const MAX_BUFFER_SECONDS: usize = 30;
const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE as usize * MAX_BUFFER_SECONDS;
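// 16,000 samples/s × 30 s = 480,000 samples (≈1.8 MiB of f32).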
if buffer.len() > MAX_BUFFER_SAMPLES {
// Calculate how many samples to remove from the beginning
let samples_to_remove = buffer.len() - MAX_BUFFER_SAMPLES;
buffer.drain(..samples_to_remove);
// Return the chunk and adjust the offset
return (chunk, current_len - samples_to_remove);
}
// Return the chunk and the new offset
(chunk, current_len)
}
/// Get all audio samples from the given offset without modifying the buffer.
/// This is used when stopping recording to get any remaining audio.
pub fn get_remaining_audio(&self, from_sample: usize) -> Vec<f32> {
let buffer = self.buffer.lock();
if from_sample >= buffer.len() {
return Vec::new();
}
buffer[from_sample..].to_vec()
}
}
impl Default for AudioCapture {
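To make the offset contract concrete, here is a standalone sketch (not part of the commit) that mirrors extract_chunk's arithmetic on a plain Vec<f32>. WHISPER_SAMPLE_RATE and the 30-second cap are restated locally so the snippet runs on its own; extract_chunk_sketch and the main driver are illustrative names, not code from this repository.

const WHISPER_SAMPLE_RATE: usize = 16_000;
const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE * 30;

/// Mirrors `extract_chunk`: return the samples after `from_sample` and the
/// rebased offset to use next time. Trimming shifts every absolute position
/// left by `removed`, which is why the returned offset must be adopted.
fn extract_chunk_sketch(buffer: &mut Vec<f32>, from_sample: usize) -> (Vec<f32>, usize) {
    let current_len = buffer.len();
    if from_sample >= current_len {
        return (Vec::new(), current_len);
    }
    let chunk = buffer[from_sample..].to_vec();
    if current_len > MAX_BUFFER_SAMPLES {
        let removed = current_len - MAX_BUFFER_SAMPLES;
        buffer.drain(..removed);
        return (chunk, current_len - removed);
    }
    (chunk, current_len)
}

fn main() {
    // 35 s of silence; the caller has already consumed the first 30 s.
    let mut buffer = vec![0.0_f32; WHISPER_SAMPLE_RATE * 35];
    let (chunk, next_offset) = extract_chunk_sketch(&mut buffer, WHISPER_SAMPLE_RATE * 30);
    assert_eq!(chunk.len(), WHISPER_SAMPLE_RATE * 5);  // only the new 5 s come back
    assert_eq!(buffer.len(), MAX_BUFFER_SAMPLES);      // trimmed to the 30 s cap
    assert_eq!(next_offset, WHISPER_SAMPLE_RATE * 30); // rebased: 480,000, not 560,000
}

One consequence worth noting: processAudioChunk below only adopts newOffset when a chunk reaches the 5-second threshold, so a shorter read that coincides with a trim would leave the stored offset stale.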
+6
@@ -175,6 +175,12 @@ body {
animation: pulse 1.5s infinite;
}
.real-time-indicator {
font-size: 0.875rem;
color: var(--secondary-color);
font-style: italic;
}
.recording-duration {
font-weight: 600;
font-variant-numeric: tabular-nums;
+84 -5
@@ -25,6 +25,9 @@ function App() {
const [recordingDuration, setRecordingDuration] = useState(0);
const initStarted = useRef(false);
const recordingTimer = useRef<number | null>(null);
const transcriptionTimer = useRef<number | null>(null);
const audioOffset = useRef(0);
const totalProcessedSamples = useRef(0);
useEffect(() => {
if (initStarted.current) return;
@@ -33,12 +36,15 @@ function App() {
initializeApp();
}, []);
- // Cleanup timer on unmount
+ // Cleanup timers on unmount
useEffect(() => {
return () => {
if (recordingTimer.current) {
clearInterval(recordingTimer.current);
}
if (transcriptionTimer.current) {
clearInterval(transcriptionTimer.current);
}
};
}, []);
@@ -93,12 +99,51 @@ function App() {
initializeApp();
};
const processAudioChunk = async () => {
try {
// Get the next chunk of audio
const [audioChunk, newOffset] = await invoke<[number[], number]>("get_audio_chunk", {
lastOffset: audioOffset.current
});
// If we have enough audio (at least 5 seconds worth at 16kHz)
if (audioChunk.length >= 5 * 16000) {
// Transcribe the chunk
const newSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
audioData: audioChunk
});
if (newSegments.length > 0) {
// Calculate timestamps based on total processed samples
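// (each chunk is transcribed on its own, so its timestamps restart at 0)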
const baseTime = totalProcessedSamples.current / 16000;
const adjustedSegments = newSegments.map(seg => ({
...seg,
start: seg.start + baseTime,
end: seg.end + baseTime,
}));
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
}
// Track total processed samples
totalProcessedSamples.current += audioChunk.length;
// Update the offset for next time
audioOffset.current = newOffset;
}
} catch (error) {
console.error("Failed to process audio chunk:", error);
}
};
const startRecording = async () => {
try {
setAppState("recording");
setRecordingDuration(0);
setTranscriptSegments([]);
setSummary(null);
audioOffset.current = 0;
totalProcessedSamples.current = 0;
await invoke("start_recording");
@@ -106,6 +151,11 @@ function App() {
recordingTimer.current = window.setInterval(() => {
setRecordingDuration(d => d + 1);
}, 1000);
// Start real-time transcription timer (every 5 seconds)
transcriptionTimer.current = window.setInterval(() => {
processAudioChunk();
}, 5000);
} catch (error) {
console.error("Failed to start recording:", error);
setAppState("ready");
@@ -115,17 +165,43 @@ function App() {
const stopRecording = async () => {
try {
- // Stop the timer
+ // Stop the timers
if (recordingTimer.current) {
clearInterval(recordingTimer.current);
recordingTimer.current = null;
}
if (transcriptionTimer.current) {
clearInterval(transcriptionTimer.current);
transcriptionTimer.current = null;
}
setAppState("transcribing"); setAppState("transcribing");
setStatusMessage("Transcribing audio..."); setStatusMessage("Processing final audio...");
const segments = await invoke<TranscriptSegment[]>("stop_recording"); // Process any remaining audio
setTranscriptSegments(segments); const finalChunk = await invoke<number[]>("get_remaining_audio", {
lastOffset: audioOffset.current
});
if (finalChunk.length > 0) {
const finalSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
audioData: finalChunk
});
if (finalSegments.length > 0) {
const baseTime = totalProcessedSamples.current / 16000;
const adjustedSegments = finalSegments.map(seg => ({
...seg,
start: seg.start + baseTime,
end: seg.end + baseTime,
}));
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
}
}
// Stop the recording
await invoke("stop_recording");
setAppState("ready"); setAppState("ready");
setStatusMessage(""); setStatusMessage("");
@@ -264,6 +340,9 @@ function App() {
<div className="recording-indicator"> <div className="recording-indicator">
<span className="recording-dot" /> <span className="recording-dot" />
Recording: {formatDuration(recordingDuration)} Recording: {formatDuration(recordingDuration)}
{transcriptSegments.length > 0 && (
<span className="real-time-indicator"> (Real-time transcription active)</span>
)}
</div>
<button className="stop-button" onClick={stopRecording}>
Stop Recording