generated from nhcarrigan/template
feat: handle transcription in real time
This commit is contained in:
@@ -387,6 +387,35 @@ async fn transcribe_chunk(
|
|||||||
Ok(segments)
|
Ok(segments)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the next chunk of audio for real-time transcription.
|
||||||
|
/// Returns the audio chunk and the new offset to use for the next call.
|
||||||
|
#[tauri::command]
|
||||||
|
async fn get_audio_chunk(
|
||||||
|
state: State<'_, AppState>,
|
||||||
|
last_offset: usize,
|
||||||
|
) -> Result<(Vec<f32>, usize), String> {
|
||||||
|
let audio_guard = state.audio_capture.lock();
|
||||||
|
if let Some(ref capture) = *audio_guard {
|
||||||
|
Ok(capture.extract_chunk(last_offset))
|
||||||
|
} else {
|
||||||
|
Err("No active recording".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get remaining audio without modifying the buffer (for final processing).
|
||||||
|
#[tauri::command]
|
||||||
|
async fn get_remaining_audio(
|
||||||
|
state: State<'_, AppState>,
|
||||||
|
last_offset: usize,
|
||||||
|
) -> Result<Vec<f32>, String> {
|
||||||
|
let audio_guard = state.audio_capture.lock();
|
||||||
|
if let Some(ref capture) = *audio_guard {
|
||||||
|
Ok(capture.get_remaining_audio(last_offset))
|
||||||
|
} else {
|
||||||
|
Err("No active recording".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Generate a summary from a transcript.
|
/// Generate a summary from a transcript.
|
||||||
#[tauri::command]
|
#[tauri::command]
|
||||||
async fn summarize(
|
async fn summarize(
|
||||||
@@ -449,6 +478,8 @@ pub fn run() {
|
|||||||
start_recording,
|
start_recording,
|
||||||
stop_recording,
|
stop_recording,
|
||||||
transcribe_chunk,
|
transcribe_chunk,
|
||||||
|
get_audio_chunk,
|
||||||
|
get_remaining_audio,
|
||||||
summarize,
|
summarize,
|
||||||
get_backend_logs,
|
get_backend_logs,
|
||||||
check_ready,
|
check_ready,
|
||||||
|
|||||||
@@ -196,6 +196,51 @@ impl AudioCapture {
|
|||||||
let sample_count = self.buffer.lock().len();
|
let sample_count = self.buffer.lock().len();
|
||||||
sample_count as f32 / WHISPER_SAMPLE_RATE as f32
|
sample_count as f32 / WHISPER_SAMPLE_RATE as f32
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extract audio chunk for real-time processing.
|
||||||
|
/// This method retrieves audio starting from the given offset and returns only
|
||||||
|
/// the new samples, keeping a small overlap for context.
|
||||||
|
pub fn extract_chunk(&self, from_sample: usize) -> (Vec<f32>, usize) {
|
||||||
|
let mut buffer = self.buffer.lock();
|
||||||
|
let current_len = buffer.len();
|
||||||
|
|
||||||
|
// If we don't have enough new samples, return empty
|
||||||
|
if from_sample >= current_len {
|
||||||
|
return (Vec::new(), current_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract new samples
|
||||||
|
let chunk: Vec<f32> = buffer[from_sample..].to_vec();
|
||||||
|
|
||||||
|
// Keep only the last 30 seconds of audio (at 16kHz) to prevent memory exhaustion
|
||||||
|
// This provides enough overlap for context while limiting memory usage
|
||||||
|
const MAX_BUFFER_SECONDS: usize = 30;
|
||||||
|
const MAX_BUFFER_SAMPLES: usize = WHISPER_SAMPLE_RATE as usize * MAX_BUFFER_SECONDS;
|
||||||
|
|
||||||
|
if buffer.len() > MAX_BUFFER_SAMPLES {
|
||||||
|
// Calculate how many samples to remove from the beginning
|
||||||
|
let samples_to_remove = buffer.len() - MAX_BUFFER_SAMPLES;
|
||||||
|
buffer.drain(..samples_to_remove);
|
||||||
|
|
||||||
|
// Return the chunk and adjust the offset
|
||||||
|
return (chunk, current_len - samples_to_remove);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the chunk and the new offset
|
||||||
|
(chunk, current_len)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get all audio samples from the given offset without modifying the buffer.
|
||||||
|
/// This is used when stopping recording to get any remaining audio.
|
||||||
|
pub fn get_remaining_audio(&self, from_sample: usize) -> Vec<f32> {
|
||||||
|
let buffer = self.buffer.lock();
|
||||||
|
|
||||||
|
if from_sample >= buffer.len() {
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer[from_sample..].to_vec()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for AudioCapture {
|
impl Default for AudioCapture {
|
||||||
|
|||||||
@@ -175,6 +175,12 @@ body {
|
|||||||
animation: pulse 1.5s infinite;
|
animation: pulse 1.5s infinite;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Text style for the real-time transcription indicator. */
.real-time-indicator {
  color: var(--secondary-color);
  font-size: 0.875rem;
  font-style: italic;
}
|
||||||
|
|
||||||
.recording-duration {
|
.recording-duration {
|
||||||
font-weight: 600;
|
font-weight: 600;
|
||||||
font-variant-numeric: tabular-nums;
|
font-variant-numeric: tabular-nums;
|
||||||
|
|||||||
+84
-5
@@ -25,6 +25,9 @@ function App() {
|
|||||||
const [recordingDuration, setRecordingDuration] = useState(0);
|
const [recordingDuration, setRecordingDuration] = useState(0);
|
||||||
const initStarted = useRef(false);
|
const initStarted = useRef(false);
|
||||||
const recordingTimer = useRef<number | null>(null);
|
const recordingTimer = useRef<number | null>(null);
|
||||||
|
const transcriptionTimer = useRef<number | null>(null);
|
||||||
|
const audioOffset = useRef(0);
|
||||||
|
const totalProcessedSamples = useRef(0);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (initStarted.current) return;
|
if (initStarted.current) return;
|
||||||
@@ -33,12 +36,15 @@ function App() {
|
|||||||
initializeApp();
|
initializeApp();
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
// Cleanup timer on unmount
|
// Cleanup timers on unmount
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
return () => {
|
return () => {
|
||||||
if (recordingTimer.current) {
|
if (recordingTimer.current) {
|
||||||
clearInterval(recordingTimer.current);
|
clearInterval(recordingTimer.current);
|
||||||
}
|
}
|
||||||
|
if (transcriptionTimer.current) {
|
||||||
|
clearInterval(transcriptionTimer.current);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
@@ -93,12 +99,51 @@ function App() {
|
|||||||
initializeApp();
|
initializeApp();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Fetch the audio captured since the last processed offset and, once enough
 * has accumulated, transcribe it and append the segments to the transcript.
 *
 * Segment timestamps are shifted by the duration of all audio claimed so far,
 * so they are relative to the start of the recording. Errors are logged and
 * swallowed so that one failed chunk does not interrupt the recording.
 */
const processAudioChunk = async () => {
  const SAMPLE_RATE = 16000;
  // Chunks shorter than five seconds are normally left for a later call.
  const MIN_CHUNK_SAMPLES = 5 * SAMPLE_RATE;

  try {
    const requestedOffset = audioOffset.current;
    const [audioChunk, newOffset] = await invoke<[number[], number]>("get_audio_chunk", {
      lastOffset: requestedOffset
    });

    // With an untouched buffer the returned offset is exactly the requested
    // offset plus the chunk length. Any other value means the backend trimmed
    // its buffer (or the requested offset was past its end), so the stored
    // offset no longer addresses this audio and it cannot be fetched again.
    const bufferShifted = newOffset !== requestedOffset + audioChunk.length;

    if (audioChunk.length < MIN_CHUNK_SAMPLES && !bufferShifted) {
      // Not enough new audio yet; keep the offset so the next call returns a longer chunk.
      return;
    }

    // Claim the audio before awaiting transcription. If transcription takes
    // longer than the timer interval, or recording is stopped meanwhile, the
    // next reader starts after this chunk instead of transcribing it twice.
    const baseTime = totalProcessedSamples.current / SAMPLE_RATE;
    totalProcessedSamples.current += audioChunk.length;
    audioOffset.current = newOffset;

    if (audioChunk.length === 0) {
      // Only the offset needed resynchronising; there is nothing to transcribe.
      return;
    }

    const newSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
      audioData: audioChunk
    });

    if (newSegments.length > 0) {
      // Shift chunk-relative timestamps to recording-relative ones.
      const adjustedSegments = newSegments.map(seg => ({
        ...seg,
        start: seg.start + baseTime,
        end: seg.end + baseTime,
      }));

      setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
    }
  } catch (error) {
    console.error("Failed to process audio chunk:", error);
  }
};
|
||||||
|
|
||||||
const startRecording = async () => {
|
const startRecording = async () => {
|
||||||
try {
|
try {
|
||||||
setAppState("recording");
|
setAppState("recording");
|
||||||
setRecordingDuration(0);
|
setRecordingDuration(0);
|
||||||
setTranscriptSegments([]);
|
setTranscriptSegments([]);
|
||||||
setSummary(null);
|
setSummary(null);
|
||||||
|
audioOffset.current = 0;
|
||||||
|
totalProcessedSamples.current = 0;
|
||||||
|
|
||||||
await invoke("start_recording");
|
await invoke("start_recording");
|
||||||
|
|
||||||
@@ -106,6 +151,11 @@ function App() {
|
|||||||
recordingTimer.current = window.setInterval(() => {
|
recordingTimer.current = window.setInterval(() => {
|
||||||
setRecordingDuration(d => d + 1);
|
setRecordingDuration(d => d + 1);
|
||||||
}, 1000);
|
}, 1000);
|
||||||
|
|
||||||
|
// Start real-time transcription timer (every 5 seconds)
|
||||||
|
transcriptionTimer.current = window.setInterval(() => {
|
||||||
|
processAudioChunk();
|
||||||
|
}, 5000);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Failed to start recording:", error);
|
console.error("Failed to start recording:", error);
|
||||||
setAppState("ready");
|
setAppState("ready");
|
||||||
@@ -115,17 +165,43 @@ function App() {
|
|||||||
|
|
||||||
const stopRecording = async () => {
|
const stopRecording = async () => {
|
||||||
try {
|
try {
|
||||||
// Stop the timer
|
// Stop the timers
|
||||||
if (recordingTimer.current) {
|
if (recordingTimer.current) {
|
||||||
clearInterval(recordingTimer.current);
|
clearInterval(recordingTimer.current);
|
||||||
recordingTimer.current = null;
|
recordingTimer.current = null;
|
||||||
}
|
}
|
||||||
|
if (transcriptionTimer.current) {
|
||||||
|
clearInterval(transcriptionTimer.current);
|
||||||
|
transcriptionTimer.current = null;
|
||||||
|
}
|
||||||
|
|
||||||
setAppState("transcribing");
|
setAppState("transcribing");
|
||||||
setStatusMessage("Transcribing audio...");
|
setStatusMessage("Processing final audio...");
|
||||||
|
|
||||||
const segments = await invoke<TranscriptSegment[]>("stop_recording");
|
// Process any remaining audio
|
||||||
setTranscriptSegments(segments);
|
const finalChunk = await invoke<number[]>("get_remaining_audio", {
|
||||||
|
lastOffset: audioOffset.current
|
||||||
|
});
|
||||||
|
|
||||||
|
if (finalChunk.length > 0) {
|
||||||
|
const finalSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
||||||
|
audioData: finalChunk
|
||||||
|
});
|
||||||
|
|
||||||
|
if (finalSegments.length > 0) {
|
||||||
|
const baseTime = totalProcessedSamples.current / 16000;
|
||||||
|
const adjustedSegments = finalSegments.map(seg => ({
|
||||||
|
...seg,
|
||||||
|
start: seg.start + baseTime,
|
||||||
|
end: seg.end + baseTime,
|
||||||
|
}));
|
||||||
|
|
||||||
|
setTranscriptSegments(prev => [...prev, ...adjustedSegments]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop the recording
|
||||||
|
await invoke("stop_recording");
|
||||||
|
|
||||||
setAppState("ready");
|
setAppState("ready");
|
||||||
setStatusMessage("");
|
setStatusMessage("");
|
||||||
@@ -264,6 +340,9 @@ function App() {
|
|||||||
<div className="recording-indicator">
|
<div className="recording-indicator">
|
||||||
<span className="recording-dot" />
|
<span className="recording-dot" />
|
||||||
Recording: {formatDuration(recordingDuration)}
|
Recording: {formatDuration(recordingDuration)}
|
||||||
|
{transcriptSegments.length > 0 && (
|
||||||
|
<span className="real-time-indicator"> (Real-time transcription active)</span>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
<button className="stop-button" onClick={stopRecording}>
|
<button className="stop-button" onClick={stopRecording}>
|
||||||
⏹️ Stop Recording
|
⏹️ Stop Recording
|
||||||
|
|||||||
Reference in New Issue
Block a user