generated from nhcarrigan/template
feat: handle transcription in real time
This commit is contained in:
@@ -387,6 +387,35 @@ async fn transcribe_chunk(
|
||||
Ok(segments)
|
||||
}
|
||||
|
||||
/// Get the next chunk of audio for real-time transcription.
|
||||
/// Returns the audio chunk and the new offset to use for the next call.
|
||||
#[tauri::command]
|
||||
async fn get_audio_chunk(
|
||||
state: State<'_, AppState>,
|
||||
last_offset: usize,
|
||||
) -> Result<(Vec<f32>, usize), String> {
|
||||
let audio_guard = state.audio_capture.lock();
|
||||
if let Some(ref capture) = *audio_guard {
|
||||
Ok(capture.extract_chunk(last_offset))
|
||||
} else {
|
||||
Err("No active recording".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Get remaining audio without modifying the buffer (for final processing).
|
||||
#[tauri::command]
|
||||
async fn get_remaining_audio(
|
||||
state: State<'_, AppState>,
|
||||
last_offset: usize,
|
||||
) -> Result<Vec<f32>, String> {
|
||||
let audio_guard = state.audio_capture.lock();
|
||||
if let Some(ref capture) = *audio_guard {
|
||||
Ok(capture.get_remaining_audio(last_offset))
|
||||
} else {
|
||||
Err("No active recording".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a summary from a transcript.
|
||||
#[tauri::command]
|
||||
async fn summarize(
|
||||
@@ -449,6 +478,8 @@ pub fn run() {
|
||||
start_recording,
|
||||
stop_recording,
|
||||
transcribe_chunk,
|
||||
get_audio_chunk,
|
||||
get_remaining_audio,
|
||||
summarize,
|
||||
get_backend_logs,
|
||||
check_ready,
|
||||
|
||||
@@ -196,6 +196,51 @@ impl AudioCapture {
|
||||
let sample_count = self.buffer.lock().len();
|
||||
sample_count as f32 / WHISPER_SAMPLE_RATE as f32
|
||||
}
|
||||
|
||||
/// Extract audio chunk for real-time processing.
|
||||
/// This method retrieves audio starting from the given offset and returns only
|
||||
/// the new samples, keeping a small overlap for context.
|
||||
pub fn extract_chunk(&self, from_sample: usize) -> (Vec<f32>, usize) {
|
||||
let mut buffer = self.buffer.lock();
|
||||
let current_len = buffer.len();
|
||||
|
||||
// If we don't have enough new samples, return empty
|
||||
if from_sample >= current_len {
|
||||
return (Vec::new(), current_len);
|
||||
}
|
||||
|
||||
// Extract new samples
|
||||
let chunk: Vec<f32> = buffer[from_sample..].to_vec();
|
||||
|
||||
// Keep only the last 30 seconds of audio (at 16kHz) to prevent memory exhaustion
|
||||
// This provides enough overlap for context while limiting memory usage
|
||||
const MAX_BUFFER_SECONDS: usize = 30;
|
||||
const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE as usize * MAX_BUFFER_SECONDS;
|
||||
|
||||
if buffer.len() > MAX_BUFFER_SAMPLES {
|
||||
// Calculate how many samples to remove from the beginning
|
||||
let samples_to_remove = buffer.len() - MAX_BUFFER_SAMPLES;
|
||||
buffer.drain(..samples_to_remove);
|
||||
|
||||
// Return the chunk and adjust the offset
|
||||
return (chunk, current_len - samples_to_remove);
|
||||
}
|
||||
|
||||
// Return the chunk and the new offset
|
||||
(chunk, current_len)
|
||||
}
|
||||
|
||||
/// Get all audio samples from the given offset without modifying the buffer.
|
||||
/// This is used when stopping recording to get any remaining audio.
|
||||
pub fn get_remaining_audio(&self, from_sample: usize) -> Vec<f32> {
|
||||
let buffer = self.buffer.lock();
|
||||
|
||||
if from_sample >= buffer.len() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
buffer[from_sample..].to_vec()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AudioCapture {
|
||||
|
||||
@@ -175,6 +175,12 @@ body {
|
||||
animation: pulse 1.5s infinite;
|
||||
}
|
||||
|
||||
/* Subtle italic caption shown next to the recording timer while
   live transcription is producing segments. */
.real-time-indicator {
  font-style: italic;
  font-size: 0.875rem;
  color: var(--secondary-color);
}
|
||||
|
||||
.recording-duration {
|
||||
font-weight: 600;
|
||||
font-variant-numeric: tabular-nums;
|
||||
|
||||
+84
-5
@@ -25,6 +25,9 @@ function App() {
|
||||
const [recordingDuration, setRecordingDuration] = useState(0);
|
||||
const initStarted = useRef(false);
|
||||
const recordingTimer = useRef<number | null>(null);
|
||||
const transcriptionTimer = useRef<number | null>(null);
|
||||
const audioOffset = useRef(0);
|
||||
const totalProcessedSamples = useRef(0);
|
||||
|
||||
useEffect(() => {
|
||||
if (initStarted.current) return;
|
||||
@@ -33,12 +36,15 @@ function App() {
|
||||
initializeApp();
|
||||
}, []);
|
||||
|
||||
// Cleanup timer on unmount
|
||||
// Cleanup timers on unmount
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
if (recordingTimer.current) {
|
||||
clearInterval(recordingTimer.current);
|
||||
}
|
||||
if (transcriptionTimer.current) {
|
||||
clearInterval(transcriptionTimer.current);
|
||||
}
|
||||
};
|
||||
}, []);
|
||||
|
||||
@@ -93,12 +99,51 @@ function App() {
|
||||
initializeApp();
|
||||
};
|
||||
|
||||
const processAudioChunk = async () => {
|
||||
try {
|
||||
// Get the next chunk of audio
|
||||
const [audioChunk, newOffset] = await invoke<[number[], number]>("get_audio_chunk", {
|
||||
lastOffset: audioOffset.current
|
||||
});
|
||||
|
||||
// If we have enough audio (at least 5 seconds worth at 16kHz)
|
||||
if (audioChunk.length >= 5 * 16000) {
|
||||
// Transcribe the chunk
|
||||
const newSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
||||
audioData: audioChunk
|
||||
});
|
||||
|
||||
if (newSegments.length > 0) {
|
||||
// Calculate timestamps based on total processed samples
|
||||
const baseTime = totalProcessedSamples.current / 16000;
|
||||
const adjustedSegments = newSegments.map(seg => ({
|
||||
...seg,
|
||||
start: seg.start + baseTime,
|
||||
end: seg.end + baseTime,
|
||||
}));
|
||||
|
||||
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
|
||||
}
|
||||
|
||||
// Track total processed samples
|
||||
totalProcessedSamples.current += audioChunk.length;
|
||||
|
||||
// Update the offset for next time
|
||||
audioOffset.current = newOffset;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Failed to process audio chunk:", error);
|
||||
}
|
||||
};
|
||||
|
||||
const startRecording = async () => {
|
||||
try {
|
||||
setAppState("recording");
|
||||
setRecordingDuration(0);
|
||||
setTranscriptSegments([]);
|
||||
setSummary(null);
|
||||
audioOffset.current = 0;
|
||||
totalProcessedSamples.current = 0;
|
||||
|
||||
await invoke("start_recording");
|
||||
|
||||
@@ -106,6 +151,11 @@ function App() {
|
||||
recordingTimer.current = window.setInterval(() => {
|
||||
setRecordingDuration(d => d + 1);
|
||||
}, 1000);
|
||||
|
||||
// Start real-time transcription timer (every 5 seconds)
|
||||
transcriptionTimer.current = window.setInterval(() => {
|
||||
processAudioChunk();
|
||||
}, 5000);
|
||||
} catch (error) {
|
||||
console.error("Failed to start recording:", error);
|
||||
setAppState("ready");
|
||||
@@ -115,17 +165,43 @@ function App() {
|
||||
|
||||
const stopRecording = async () => {
|
||||
try {
|
||||
// Stop the timer
|
||||
// Stop the timers
|
||||
if (recordingTimer.current) {
|
||||
clearInterval(recordingTimer.current);
|
||||
recordingTimer.current = null;
|
||||
}
|
||||
if (transcriptionTimer.current) {
|
||||
clearInterval(transcriptionTimer.current);
|
||||
transcriptionTimer.current = null;
|
||||
}
|
||||
|
||||
setAppState("transcribing");
|
||||
setStatusMessage("Transcribing audio...");
|
||||
setStatusMessage("Processing final audio...");
|
||||
|
||||
const segments = await invoke<TranscriptSegment[]>("stop_recording");
|
||||
setTranscriptSegments(segments);
|
||||
// Process any remaining audio
|
||||
const finalChunk = await invoke<number[]>("get_remaining_audio", {
|
||||
lastOffset: audioOffset.current
|
||||
});
|
||||
|
||||
if (finalChunk.length > 0) {
|
||||
const finalSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
||||
audioData: finalChunk
|
||||
});
|
||||
|
||||
if (finalSegments.length > 0) {
|
||||
const baseTime = totalProcessedSamples.current / 16000;
|
||||
const adjustedSegments = finalSegments.map(seg => ({
|
||||
...seg,
|
||||
start: seg.start + baseTime,
|
||||
end: seg.end + baseTime,
|
||||
}));
|
||||
|
||||
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
|
||||
}
|
||||
}
|
||||
|
||||
// Stop the recording
|
||||
await invoke("stop_recording");
|
||||
|
||||
setAppState("ready");
|
||||
setStatusMessage("");
|
||||
@@ -264,6 +340,9 @@ function App() {
|
||||
<div className="recording-indicator">
|
||||
<span className="recording-dot" />
|
||||
Recording: {formatDuration(recordingDuration)}
|
||||
{transcriptSegments.length > 0 && (
|
||||
<span className="real-time-indicator"> (Real-time transcription active)</span>
|
||||
)}
|
||||
</div>
|
||||
<button className="stop-button" onClick={stopRecording}>
|
||||
⏹️ Stop Recording
|
||||
|
||||
Reference in New Issue
Block a user