From 4ed7462a1787a1e44909cda2129d271394357cd9 Mon Sep 17 00:00:00 2001 From: Naomi Carrigan Date: Thu, 29 Jan 2026 15:34:04 -0800 Subject: [PATCH] feat: diarisation maybe --- src-tauri/Cargo.lock | 156 ++++++++++++++++++++- src-tauri/Cargo.toml | 4 + src-tauri/src/lib.rs | 68 ++------- src-tauri/src/ml/vad.rs | 303 +++++++++++++++++++++++++++++++++------- src/App.tsx | 8 +- 5 files changed, 432 insertions(+), 107 deletions(-) diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index e1b4aca..ee047d4 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -519,11 +519,13 @@ name = "chronara" version = "0.1.0" dependencies = [ "cpal", + "dasp", "futures-util", "hound", "llama-cpp-2", "parking_lot", "reqwest", + "rustfft", "serde", "serde_json", "tauri", @@ -813,12 +815,125 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "dasp" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7381b67da416b639690ac77c73b86a7b5e64a29e31d1f75fb3b1102301ef355a" +dependencies = [ + "dasp_envelope", + "dasp_frame", + "dasp_interpolate", + "dasp_peak", + "dasp_ring_buffer", + "dasp_rms", + "dasp_sample", + "dasp_signal", + "dasp_slice", + "dasp_window", +] + +[[package]] +name = "dasp_envelope" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ec617ce7016f101a87fe85ed44180839744265fae73bb4aa43e7ece1b7668b6" +dependencies = [ + "dasp_frame", + "dasp_peak", + "dasp_ring_buffer", + "dasp_rms", + "dasp_sample", +] + +[[package]] +name = "dasp_frame" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a3937f5fe2135702897535c8d4a5553f8b116f76c1529088797f2eee7c5cd6" +dependencies = [ + "dasp_sample", +] + +[[package]] +name = "dasp_interpolate" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fc975a6563bb7ca7ec0a6c784ead49983a21c24835b0bc96eea11ee407c7486" +dependencies = [ + 
"dasp_frame", + "dasp_ring_buffer", + "dasp_sample", +] + +[[package]] +name = "dasp_peak" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cf88559d79c21f3d8523d91250c397f9a15b5fc72fbb3f87fdb0a37b79915bf" +dependencies = [ + "dasp_frame", + "dasp_sample", +] + +[[package]] +name = "dasp_ring_buffer" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07d79e19b89618a543c4adec9c5a347fe378a19041699b3278e616e387511ea1" + +[[package]] +name = "dasp_rms" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6c5dcb30b7e5014486e2822537ea2beae50b19722ffe2ed7549ab03774575aa" +dependencies = [ + "dasp_frame", + "dasp_ring_buffer", + "dasp_sample", +] + [[package]] name = "dasp_sample" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c87e182de0887fd5361989c677c4e8f5000cd9491d6d563161a8f3a5519fc7f" +[[package]] +name = "dasp_signal" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa1ab7d01689c6ed4eae3d38fe1cea08cba761573fbd2d592528d55b421077e7" +dependencies = [ + "dasp_envelope", + "dasp_frame", + "dasp_interpolate", + "dasp_peak", + "dasp_ring_buffer", + "dasp_rms", + "dasp_sample", + "dasp_window", +] + +[[package]] +name = "dasp_slice" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e1c7335d58e7baedafa516cb361360ff38d6f4d3f9d9d5ee2a2fc8e27178fa1" +dependencies = [ + "dasp_frame", + "dasp_sample", +] + +[[package]] +name = "dasp_window" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99ded7b88821d2ce4e8b842c9f1c86ac911891ab89443cc1de750cae764c5076" +dependencies = [ + "dasp_sample", +] + [[package]] name = "der" version = "0.7.10" @@ -3222,6 +3337,15 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "primal-check" +version = 
"0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08" +dependencies = [ + "num-integer", +] + [[package]] name = "proc-macro-crate" version = "1.3.1" @@ -3351,7 +3475,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -3639,6 +3763,20 @@ dependencies = [ "semver", ] +[[package]] +name = "rustfft" +version = "6.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89" +dependencies = [ + "num-complex", + "num-integer", + "num-traits", + "primal-check", + "strength_reduce", + "transpose", +] + [[package]] name = "rustix" version = "1.1.3" @@ -4142,6 +4280,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "string_cache" version = "0.8.9" @@ -4945,6 +5089,16 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "transpose" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e" +dependencies = [ + "num-integer", + "strength_reduce", +] + [[package]] name = "tray-icon" version = "0.21.3" diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 31f292f..dc23932 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -34,6 +34,10 @@ hound = "3.5" # WAV file handling # Voice activity detection voice_activity_detector = "0.2" +# Audio analysis for speaker detection +dasp = "0.11" # Digital audio signal processing +rustfft = "6.2" # FFT for frequency analysis 
+ # Async runtime tokio = { version = "1", features = ["full"] } diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index accf071..4af32fc 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -277,6 +277,7 @@ async fn initialize_models( } } + // Initialize storage emit_log(&app_handle, &logs, "[Init] Initializing recording storage..."); if let Ok(app_data_dir) = app_handle.path().app_data_dir() { @@ -359,62 +360,6 @@ async fn stop_recording( Ok("Recording stopped".to_string()) } -/// Stop recording and transcribe all at once (batch mode). -#[tauri::command] -async fn stop_recording_batch( - state: State<'_, AppState>, - app_handle: tauri::AppHandle, -) -> Result, String> { - let logs = Arc::clone(&state.logs); - - emit_log(&app_handle, &logs, "[Audio] Stopping recording (batch mode)..."); - - // Get the audio samples - let audio_samples = { - let mut audio_guard = state.audio_capture.lock(); - if let Some(ref mut capture) = *audio_guard { - capture.stop_recording() - } else { - return Err("No active recording".to_string()); - } - }; - - let duration = audio_samples.len() as f32 / 16000.0; - emit_log(&app_handle, &logs, &format!("[Audio] Captured {:.1}s of audio", duration)); - - if audio_samples.is_empty() { - return Err("No audio captured".to_string()); - } - - // Transcribe the audio - emit_log(&app_handle, &logs, "[Transcribe] Starting transcription..."); - - let app_handle_clone = app_handle.clone(); - let mut segments = { - let transcriber = state.transcriber.lock(); - if !transcriber.is_loaded() { - emit_log(&app_handle, &logs, "[Transcribe ERROR] Whisper model not loaded"); - return Err("Whisper model not loaded. Please ensure the model is downloaded.".to_string()); - } - - transcriber.transcribe_with_progress(&audio_samples, move |progress| { - // Emit progress event to frontend - let _ = app_handle_clone.emit("transcription-progress", progress); - }) - .map_err(|e| format!("Transcription failed: {}", e))? 
- }; - - emit_log(&app_handle, &logs, &format!("[Transcribe] Got {} segments", segments.len())); - - // Apply speaker labels using VAD - if let Some(ref mut separator) = *state.speaker_separator.lock() { - emit_log(&app_handle, &logs, "[Speaker] Applying speaker labels..."); - segments = separator.apply_speaker_labels(&audio_samples, segments) - .map_err(|e| format!("Speaker separation failed: {}", e))?; - } - - Ok(segments) -} /// Transcribe a chunk of audio (for real-time transcription). #[tauri::command] @@ -422,6 +367,7 @@ async fn transcribe_chunk( state: State<'_, AppState>, audio_data: Vec, app_handle: tauri::AppHandle, + chunk_start_time: f64, ) -> Result, String> { let transcriber = state.transcriber.lock(); @@ -432,12 +378,20 @@ async fn transcribe_chunk( // Clone the app handle for the closure let app_handle_clone = app_handle.clone(); - let segments = transcriber.transcribe_with_progress(&audio_data, move |progress| { + let mut segments = transcriber.transcribe_with_progress(&audio_data, move |progress| { // Emit progress event to frontend let _ = app_handle_clone.emit("transcription-progress", progress); }) .map_err(|e| format!("Transcription failed: {}", e))?; + // Apply speaker detection using enhanced VAD + let mut separator_guard = state.speaker_separator.lock(); + if let Some(ref mut separator) = *separator_guard { + // Process this chunk for speaker detection + segments = separator.apply_speaker_labels_to_chunk(&audio_data, segments, chunk_start_time) + .map_err(|e| format!("Speaker detection failed: {}", e))?; + } + Ok(segments) } diff --git a/src-tauri/src/ml/vad.rs b/src-tauri/src/ml/vad.rs index e2d8815..bcd76dd 100644 --- a/src-tauri/src/ml/vad.rs +++ b/src-tauri/src/ml/vad.rs @@ -1,10 +1,12 @@ -//! Voice Activity Detection for basic speaker separation. +//! Enhanced Voice Activity Detection for speaker separation in group meetings. //! -//! This module uses the Silero VAD model to detect speech segments -//! 
and provides basic speaker separation based on silence gaps. +//! This module uses the Silero VAD model combined with voice characteristics +//! to provide improved speaker separation for meetings with multiple speakers. +use rustfft::{FftPlanner, num_complex::Complex}; +use std::collections::HashMap; use thiserror::Error; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use voice_activity_detector::VoiceActivityDetector; use super::transcriber::TranscriptSegment; @@ -20,12 +22,41 @@ pub enum VadError { ProcessingError(String), } -/// A detected speech segment with timing information. +/// Voice characteristics for speaker fingerprinting. +#[derive(Debug, Clone)] +pub struct VoiceFingerprint { + /// Average pitch (fundamental frequency) in Hz + pub avg_pitch: f32, + /// Pitch variance + pub pitch_variance: f32, + /// Average energy level + pub avg_energy: f32, + /// Spectral centroid (brightness indicator) + pub spectral_centroid: f32, +} + +impl VoiceFingerprint { + /// Calculate similarity between two voice fingerprints (0.0 to 1.0) + pub fn similarity(&self, other: &VoiceFingerprint) -> f32 { + // Normalize differences for each feature + let pitch_diff = (self.avg_pitch - other.avg_pitch).abs() / 200.0; // Normalize by typical pitch range + let variance_diff = (self.pitch_variance - other.pitch_variance).abs() / 50.0; + let energy_diff = (self.avg_energy - other.avg_energy).abs() / 0.5; + let centroid_diff = (self.spectral_centroid - other.spectral_centroid).abs() / 1000.0; + + // Calculate weighted similarity + let diff = pitch_diff * 0.4 + variance_diff * 0.2 + energy_diff * 0.2 + centroid_diff * 0.2; + (1.0 - diff.min(1.0)).max(0.0) + } +} + +/// A detected speech segment with timing information and voice characteristics. 
#[derive(Debug, Clone)] pub struct SpeechSegment { pub start_sample: usize, pub end_sample: usize, pub speaker_id: u32, + pub fingerprint: Option, } impl SpeechSegment { @@ -45,19 +76,27 @@ impl SpeechSegment { } } -/// Voice Activity Detector for speaker separation. +/// Voice Activity Detector for speaker separation with enhanced tracking. pub struct SpeakerSeparator { vad: VoiceActivityDetector, /// Minimum silence duration (in seconds) to consider a speaker change - min_silence_for_speaker_change: f64, + _min_silence_for_speaker_change: f64, /// Minimum speech duration (in seconds) to consider a valid segment min_speech_duration: f64, + /// Known speakers and their voice fingerprints + known_speakers: HashMap, + /// Next speaker ID to assign + next_speaker_id: u32, + /// Similarity threshold for matching speakers (0.0 to 1.0) + similarity_threshold: f32, + /// FFT planner for frequency analysis + fft_planner: FftPlanner, } impl SpeakerSeparator { /// Create a new speaker separator with default settings. pub fn new() -> Result { - Self::with_settings(1.5, 0.3) + Self::with_settings(0.8, 0.3) // Reduced silence threshold for group meetings } /// Create a new speaker separator with custom settings. @@ -75,87 +114,212 @@ impl SpeakerSeparator { .build() .map_err(|e: voice_activity_detector::Error| VadError::InitError(e.to_string()))?; - info!("VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s", + info!("Enhanced VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s", VAD_SAMPLE_RATE, min_silence_for_speaker_change, min_speech_duration); Ok(Self { vad, - min_silence_for_speaker_change, + _min_silence_for_speaker_change: min_silence_for_speaker_change, min_speech_duration, + known_speakers: HashMap::new(), + next_speaker_id: 0, + similarity_threshold: 0.7, + fft_planner: FftPlanner::new(), }) } - /// Detect speech segments and assign speaker IDs based on silence gaps. + /// Extract voice fingerprint from an audio segment. 
+ fn extract_fingerprint(&mut self, audio: &[f32]) -> Option { + if audio.len() < 2048 { + return None; + } + + // Calculate average energy (mean absolute amplitude) + let avg_energy = audio.iter().map(|s| s.abs()).sum::() / audio.len() as f32; + + // Estimate pitch from the zero-crossing rate (simplified) + let pitch = self.estimate_pitch(audio); + + // Calculate spectral centroid + let spectral_centroid = self.calculate_spectral_centroid(audio); + + // Calculate pitch spread over fixed-size windows + let pitch_variance = self.calculate_pitch_variance(audio); + + Some(VoiceFingerprint { + avg_pitch: pitch, + pitch_variance, + avg_energy, + spectral_centroid, + }) + } + + /// Estimate pitch using zero-crossing rate (simplified approach). Indexes `audio[0]`, so `audio` must be non-empty. + fn estimate_pitch(&self, audio: &[f32]) -> f32 { + let mut zero_crossings = 0; + let mut prev_sign = audio[0] >= 0.0; + + for sample in audio.iter().skip(1) { + let current_sign = *sample >= 0.0; + if current_sign != prev_sign { + zero_crossings += 1; + } + prev_sign = current_sign; + } + + // Convert zero-crossing rate to approximate frequency (two crossings per cycle) + let zcr = zero_crossings as f32 / audio.len() as f32; + zcr * VAD_SAMPLE_RATE as f32 / 2.0 + } + + /// Calculate spectral centroid (center of mass of spectrum) over at most the first 2048 samples. + fn calculate_spectral_centroid(&mut self, audio: &[f32]) -> f32 { + let fft_size = 2048.min(audio.len()); + let mut input: Vec> = audio[..fft_size] + .iter() + .map(|&s| Complex::new(s, 0.0)) + .collect(); + + let fft = self.fft_planner.plan_fft_forward(fft_size); + fft.process(&mut input); + + let mut weighted_sum = 0.0; + let mut magnitude_sum = 0.0; + + for (i, complex) in input.iter().enumerate().take(fft_size / 2) { + let magnitude = complex.norm(); + let frequency = i as f32 * VAD_SAMPLE_RATE as f32 / fft_size as f32; + weighted_sum += frequency * magnitude; + magnitude_sum += magnitude; + } + + if magnitude_sum > 0.0 { + weighted_sum / magnitude_sum + } else { + 0.0 + } + } + + /// Calculate pitch variance across multiple windows (returned as its square root, i.e. a standard deviation). 
+ fn calculate_pitch_variance(&self, audio: &[f32]) -> f32 { + let window_size = 1024; + let num_windows = audio.len() / window_size; + + if num_windows < 2 { + return 0.0; + } + + let mut pitches = Vec::new(); + for i in 0..num_windows { + let start = i * window_size; + let end = (i + 1) * window_size; + let pitch = self.estimate_pitch(&audio[start..end]); + pitches.push(pitch); + } + + // Population variance of the window pitches; its square root (std deviation) is returned + let mean = pitches.iter().sum::() / pitches.len() as f32; + let variance = pitches.iter() + .map(|p| (p - mean).powi(2)) + .sum::() / pitches.len() as f32; + + variance.sqrt() + } + + /// Find or assign speaker ID based on voice fingerprint. + fn find_or_assign_speaker(&mut self, fingerprint: &VoiceFingerprint) -> u32 { + // Pick the most similar known speaker at or above similarity_threshold + let mut best_match = None; + let mut best_similarity = 0.0; + + for (speaker_id, known_fp) in &self.known_speakers { + let similarity = fingerprint.similarity(known_fp); + if similarity > best_similarity && similarity >= self.similarity_threshold { + best_similarity = similarity; + best_match = Some(*speaker_id); + } + } + + if let Some(speaker_id) = best_match { + debug!("Matched to existing speaker {} with similarity {:.2}", speaker_id, best_similarity); + speaker_id + } else { + // No match: register this fingerprint under the next free ID + let speaker_id = self.next_speaker_id; + self.next_speaker_id += 1; + self.known_speakers.insert(speaker_id, fingerprint.clone()); + debug!("New speaker {} detected", speaker_id); + speaker_id + } + } + + /// Detect speech segments and assign speaker IDs based on voice characteristics. /// - /// This is a simple heuristic: if there's a long enough silence gap, - /// we assume a different speaker might be talking. + /// This enhanced version uses voice fingerprinting to track speakers across + /// overlapping speech and short interruptions, making it suitable for group meetings. 
pub fn detect_speakers(&mut self, audio: &[f32]) -> Result, VadError> { let chunk_size = 512; - let mut segments = Vec::new(); + let mut raw_segments = Vec::new(); let mut current_segment_start: Option = None; - let mut last_speech_end: Option = None; - let mut current_speaker_id = 0u32; - - let min_silence_samples = (self.min_silence_for_speaker_change * VAD_SAMPLE_RATE as f64) as usize; let min_speech_samples = (self.min_speech_duration * VAD_SAMPLE_RATE as f64) as usize; - debug!("Processing {} samples for VAD", audio.len()); + debug!("Processing {} samples for enhanced VAD", audio.len()); + // First pass: detect raw speech segments for (chunk_idx, chunk) in audio.chunks(chunk_size).enumerate() { if chunk.len() < chunk_size { - // Skip incomplete chunks at the end continue; } let sample_offset = chunk_idx * chunk_size; - - // Predict voice activity for this chunk let probability = self.vad.predict(chunk.iter().copied()); - let is_speech = probability > 0.5; if is_speech { if current_segment_start.is_none() { - // Check if we should change speaker - if let Some(last_end) = last_speech_end { - let silence_duration = sample_offset - last_end; - if silence_duration >= min_silence_samples { - current_speaker_id += 1; - debug!("Speaker change detected at sample {} (silence: {}ms)", - sample_offset, silence_duration * 1000 / VAD_SAMPLE_RATE as usize); - } - } current_segment_start = Some(sample_offset); } } else if let Some(start) = current_segment_start { - // Speech ended let segment_duration = sample_offset - start; - if segment_duration >= min_speech_samples { - segments.push(SpeechSegment { - start_sample: start, - end_sample: sample_offset, - speaker_id: current_speaker_id, - }); + raw_segments.push((start, sample_offset)); } - - last_speech_end = Some(sample_offset); current_segment_start = None; } } - // Handle case where speech continues to the end + // Handle speech continuing to the end if let Some(start) = current_segment_start { let segment_duration = 
audio.len() - start; if segment_duration >= min_speech_samples { - segments.push(SpeechSegment { - start_sample: start, - end_sample: audio.len(), - speaker_id: current_speaker_id, - }); + raw_segments.push((start, audio.len())); } } + // Second pass: analyze voice characteristics and assign speakers + let mut segments = Vec::new(); + for (start, end) in raw_segments { + let segment_audio = &audio[start..end]; + + // Extract voice fingerprint (None when the segment is shorter than 2048 samples) + let fingerprint = self.extract_fingerprint(segment_audio); + + let speaker_id = if let Some(ref fp) = fingerprint { + self.find_or_assign_speaker(fp) + } else { + // Fallback: reuse the most recently created speaker ID (0 if none exists yet) + warn!("Could not extract fingerprint for segment at {}s", start as f64 / VAD_SAMPLE_RATE as f64); + self.next_speaker_id.saturating_sub(1) // saturating: a plain `- 1` underflows u32 while next_speaker_id is still 0 + }; + + segments.push(SpeechSegment { + start_sample: start, + end_sample: end, + speaker_id, + fingerprint, + }); + } + info!("Detected {} speech segments with {} speakers", segments.len(), segments.iter().map(|s| s.speaker_id).max().unwrap_or(0) + 1); @@ -188,6 +352,51 @@ impl SpeakerSeparator { Ok(transcript) } + + /// Apply speaker labels to a chunk of transcript segments (for real-time processing). + /// + /// This method is designed to work with the real-time transcription system, + /// maintaining speaker consistency across chunks. 
+ pub fn apply_speaker_labels_to_chunk( + &mut self, + audio: &[f32], + mut transcript: Vec, + _chunk_start_time: f64, + ) -> Result, VadError> { + // Detect speakers in this chunk + let speech_segments = self.detect_speakers(audio)?; + + for segment in &mut transcript { + // Midpoint of the transcript segment; _chunk_start_time is not applied (NOTE(review): assumes segment times are chunk-relative - confirm) + let segment_mid = (segment.start + segment.end) / 2.0; + + // Find the first speech segment containing the midpoint + for speech in &speech_segments { + let speech_start = speech.start_seconds(); + let speech_end = speech.end_seconds(); + + if segment_mid >= speech_start && segment_mid <= speech_end { + // Speaker IDs are 0-based internally; labels shown to the user are 1-based + segment.speaker = format!("Speaker {}", speech.speaker_id + 1); + break; + } + } + } + + // Only warn once more than 10 speakers have accumulated; no speakers are removed here + if self.known_speakers.len() > 10 { + warn!("Many speakers detected ({}), consider adjusting similarity threshold", self.known_speakers.len()); + } + + Ok(transcript) + } + + /// Reset speaker tracking (useful between different meetings). 
+ pub fn reset_speakers(&mut self) { + self.known_speakers.clear(); + self.next_speaker_id = 0; + info!("Speaker tracking reset"); + } } impl Default for SpeakerSeparator { diff --git a/src/App.tsx b/src/App.tsx index 5ea4e88..fc7b583 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -202,8 +202,10 @@ function App() { // If we have enough audio (at least 5 seconds worth at 16kHz) if (audioChunk.length >= 5 * 16000) { // Transcribe the chunk + const chunkStartTime = totalProcessedSamples.current / 16000; const newSegments = await invoke("transcribe_chunk", { - audioData: audioChunk + audioData: audioChunk, + chunkStartTime: chunkStartTime }); if (newSegments.length > 0) { @@ -293,9 +295,11 @@ function App() { if (finalChunk.length > 0) { console.log(`Processing final chunk of ${finalChunk.length} samples`); + const chunkStartTime = totalProcessedSamples.current / 16000; // The progress will be updated via events from the backend const finalSegments = await invoke("transcribe_chunk", { - audioData: finalChunk + audioData: finalChunk, + chunkStartTime: chunkStartTime }); if (finalSegments.length > 0) {