From 4ed7462a1787a1e44909cda2129d271394357cd9 Mon Sep 17 00:00:00 2001
From: Naomi Carrigan <commits@nhcarrigan.com>
Date: Thu, 29 Jan 2026 15:34:04 -0800
Subject: [PATCH] feat: diarisation maybe

---
 src-tauri/Cargo.lock    | 156 ++++++++++++++++++++-
 src-tauri/Cargo.toml    |   4 +
 src-tauri/src/lib.rs    |  68 ++-------
 src-tauri/src/ml/vad.rs | 303 +++++++++++++++++++++++++++++++++-------
 src/App.tsx             |   8 +-
 5 files changed, 432 insertions(+), 107 deletions(-)

diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock
index e1b4aca..ee047d4 100644
--- a/src-tauri/Cargo.lock
+++ b/src-tauri/Cargo.lock
@@ -519,11 +519,13 @@ name = "chronara"
 version = "0.1.0"
 dependencies = [
  "cpal",
+ "dasp",
  "futures-util",
  "hound",
  "llama-cpp-2",
  "parking_lot",
  "reqwest",
+ "rustfft",
  "serde",
  "serde_json",
  "tauri",
@@ -813,12 +815,125 @@ dependencies = [
  "syn 2.0.114",
 ]
 
+[[package]]
+name = "dasp"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7381b67da416b639690ac77c73b86a7b5e64a29e31d1f75fb3b1102301ef355a"
+dependencies = [
+ "dasp_envelope",
+ "dasp_frame",
+ "dasp_interpolate",
+ "dasp_peak",
+ "dasp_ring_buffer",
+ "dasp_rms",
+ "dasp_sample",
+ "dasp_signal",
+ "dasp_slice",
+ "dasp_window",
+]
+
+[[package]]
+name = "dasp_envelope"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ec617ce7016f101a87fe85ed44180839744265fae73bb4aa43e7ece1b7668b6"
+dependencies = [
+ "dasp_frame",
+ "dasp_peak",
+ "dasp_ring_buffer",
+ "dasp_rms",
+ "dasp_sample",
+]
+
+[[package]]
+name = "dasp_frame"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2a3937f5fe2135702897535c8d4a5553f8b116f76c1529088797f2eee7c5cd6"
+dependencies = [
+ "dasp_sample",
+]
+
+[[package]]
+name = "dasp_interpolate"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fc975a6563bb7ca7ec0a6c784ead49983a21c24835b0bc96eea11ee407c7486"
+dependencies = [
+ "dasp_frame",
+ "dasp_ring_buffer",
+ "dasp_sample",
+]
+
+[[package]]
+name = "dasp_peak"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cf88559d79c21f3d8523d91250c397f9a15b5fc72fbb3f87fdb0a37b79915bf"
+dependencies = [
+ "dasp_frame",
+ "dasp_sample",
+]
+
+[[package]]
+name = "dasp_ring_buffer"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07d79e19b89618a543c4adec9c5a347fe378a19041699b3278e616e387511ea1"
+
+[[package]]
+name = "dasp_rms"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6c5dcb30b7e5014486e2822537ea2beae50b19722ffe2ed7549ab03774575aa"
+dependencies = [
+ "dasp_frame",
+ "dasp_ring_buffer",
+ "dasp_sample",
+]
+
 [[package]]
 name = "dasp_sample"
 version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c87e182de0887fd5361989c677c4e8f5000cd9491d6d563161a8f3a5519fc7f"
 
+[[package]]
+name = "dasp_signal"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa1ab7d01689c6ed4eae3d38fe1cea08cba761573fbd2d592528d55b421077e7"
+dependencies = [
+ "dasp_envelope",
+ "dasp_frame",
+ "dasp_interpolate",
+ "dasp_peak",
+ "dasp_ring_buffer",
+ "dasp_rms",
+ "dasp_sample",
+ "dasp_window",
+]
+
+[[package]]
+name = "dasp_slice"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e1c7335d58e7baedafa516cb361360ff38d6f4d3f9d9d5ee2a2fc8e27178fa1"
+dependencies = [
+ "dasp_frame",
+ "dasp_sample",
+]
+
+[[package]]
+name = "dasp_window"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99ded7b88821d2ce4e8b842c9f1c86ac911891ab89443cc1de750cae764c5076"
+dependencies = [
+ "dasp_sample",
+]
+
 [[package]]
 name = "der"
 version = "0.7.10"
@@ -3222,6 +3337,15 @@ dependencies = [
  "syn 2.0.114",
 ]
 
+[[package]]
+name = "primal-check"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08"
+dependencies = [
+ "num-integer",
+]
+
 [[package]]
 name = "proc-macro-crate"
 version = "1.3.1"
@@ -3351,7 +3475,7 @@ dependencies = [
  "once_cell",
  "socket2",
  "tracing",
- "windows-sys 0.59.0",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
@@ -3639,6 +3763,20 @@ dependencies = [
  "semver",
 ]
 
+[[package]]
+name = "rustfft"
+version = "6.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89"
+dependencies = [
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "primal-check",
+ "strength_reduce",
+ "transpose",
+]
+
 [[package]]
 name = "rustix"
 version = "1.1.3"
@@ -4142,6 +4280,12 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
 
+[[package]]
+name = "strength_reduce"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82"
+
 [[package]]
 name = "string_cache"
 version = "0.8.9"
@@ -4945,6 +5089,16 @@ dependencies = [
  "tracing-log",
 ]
 
+[[package]]
+name = "transpose"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e"
+dependencies = [
+ "num-integer",
+ "strength_reduce",
+]
+
 [[package]]
 name = "tray-icon"
 version = "0.21.3"
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml
index 31f292f..dc23932 100644
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@@ -34,6 +34,10 @@ hound = "3.5"  # WAV file handling
 # Voice activity detection
 voice_activity_detector = "0.2"
 
+# Audio analysis for speaker detection
+dasp = "0.11"  # Digital audio signal processing
+rustfft = "6.2"  # FFT for frequency analysis
+
 # Async runtime
 tokio = { version = "1", features = ["full"] }
 
diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs
index accf071..4af32fc 100644
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -277,6 +277,7 @@ async fn initialize_models(
         }
     }
 
+
     // Initialize storage
     emit_log(&app_handle, &logs, "[Init] Initializing recording storage...");
     if let Ok(app_data_dir) = app_handle.path().app_data_dir() {
@@ -359,62 +360,6 @@ async fn stop_recording(
     Ok("Recording stopped".to_string())
 }
 
-/// Stop recording and transcribe all at once (batch mode).
-#[tauri::command]
-async fn stop_recording_batch(
-    state: State<'_, AppState>,
-    app_handle: tauri::AppHandle,
-) -> Result<Vec<TranscriptSegment>, String> {
-    let logs = Arc::clone(&state.logs);
-
-    emit_log(&app_handle, &logs, "[Audio] Stopping recording (batch mode)...");
-
-    // Get the audio samples
-    let audio_samples = {
-        let mut audio_guard = state.audio_capture.lock();
-        if let Some(ref mut capture) = *audio_guard {
-            capture.stop_recording()
-        } else {
-            return Err("No active recording".to_string());
-        }
-    };
-
-    let duration = audio_samples.len() as f32 / 16000.0;
-    emit_log(&app_handle, &logs, &format!("[Audio] Captured {:.1}s of audio", duration));
-
-    if audio_samples.is_empty() {
-        return Err("No audio captured".to_string());
-    }
-
-    // Transcribe the audio
-    emit_log(&app_handle, &logs, "[Transcribe] Starting transcription...");
-
-    let app_handle_clone = app_handle.clone();
-    let mut segments = {
-        let transcriber = state.transcriber.lock();
-        if !transcriber.is_loaded() {
-            emit_log(&app_handle, &logs, "[Transcribe ERROR] Whisper model not loaded");
-            return Err("Whisper model not loaded. Please ensure the model is downloaded.".to_string());
-        }
-
-        transcriber.transcribe_with_progress(&audio_samples, move |progress| {
-            // Emit progress event to frontend
-            let _ = app_handle_clone.emit("transcription-progress", progress);
-        })
-            .map_err(|e| format!("Transcription failed: {}", e))?
-    };
-
-    emit_log(&app_handle, &logs, &format!("[Transcribe] Got {} segments", segments.len()));
-
-    // Apply speaker labels using VAD
-    if let Some(ref mut separator) = *state.speaker_separator.lock() {
-        emit_log(&app_handle, &logs, "[Speaker] Applying speaker labels...");
-        segments = separator.apply_speaker_labels(&audio_samples, segments)
-            .map_err(|e| format!("Speaker separation failed: {}", e))?;
-    }
-
-    Ok(segments)
-}
 
 /// Transcribe a chunk of audio (for real-time transcription).
 #[tauri::command]
@@ -422,6 +367,7 @@ async fn transcribe_chunk(
     state: State<'_, AppState>,
     audio_data: Vec<f32>,
     app_handle: tauri::AppHandle,
+    chunk_start_time: f64,
 ) -> Result<Vec<TranscriptSegment>, String> {
     let transcriber = state.transcriber.lock();
 
@@ -432,12 +378,20 @@ async fn transcribe_chunk(
     // Clone the app handle for the closure
     let app_handle_clone = app_handle.clone();
 
-    let segments = transcriber.transcribe_with_progress(&audio_data, move |progress| {
+    let mut segments = transcriber.transcribe_with_progress(&audio_data, move |progress| {
         // Emit progress event to frontend
         let _ = app_handle_clone.emit("transcription-progress", progress);
     })
         .map_err(|e| format!("Transcription failed: {}", e))?;
 
+    // Apply speaker detection using enhanced VAD
+    let mut separator_guard = state.speaker_separator.lock();
+    if let Some(ref mut separator) = *separator_guard {
+        // Process this chunk for speaker detection
+        segments = separator.apply_speaker_labels_to_chunk(&audio_data, segments, chunk_start_time)
+            .map_err(|e| format!("Speaker detection failed: {}", e))?;
+    }
+
     Ok(segments)
 }
 
diff --git a/src-tauri/src/ml/vad.rs b/src-tauri/src/ml/vad.rs
index e2d8815..bcd76dd 100644
--- a/src-tauri/src/ml/vad.rs
+++ b/src-tauri/src/ml/vad.rs
@@ -1,10 +1,12 @@
-//! Voice Activity Detection for basic speaker separation.
+//! Enhanced Voice Activity Detection for speaker separation in group meetings.
 //!
-//! This module uses the Silero VAD model to detect speech segments
-//! and provides basic speaker separation based on silence gaps.
+//! This module uses the Silero VAD model combined with voice characteristics
+//! to provide improved speaker separation for meetings with multiple speakers.
 
+use rustfft::{FftPlanner, num_complex::Complex};
+use std::collections::HashMap;
 use thiserror::Error;
-use tracing::{debug, info};
+use tracing::{debug, info, warn};
 use voice_activity_detector::VoiceActivityDetector;
 
 use super::transcriber::TranscriptSegment;
@@ -20,12 +22,41 @@ pub enum VadError {
     ProcessingError(String),
 }
 
-/// A detected speech segment with timing information.
+/// Voice characteristics used to tell speakers apart (compared via `similarity`).
+#[derive(Debug, Clone)]
+pub struct VoiceFingerprint {
+    /// Approximate pitch in Hz (the extractor in this module derives it from the zero-crossing rate)
+    pub avg_pitch: f32,
+    /// Pitch spread in Hz; the extractor stores a standard deviation here, not a squared variance
+    pub pitch_variance: f32,
+    /// Average energy level, computed by the extractor as the mean absolute sample value
+    pub avg_energy: f32,
+    /// Spectral centroid in Hz (brightness indicator)
+    pub spectral_centroid: f32,
+}
+
+impl VoiceFingerprint {
+    /// Calculate similarity between two voice fingerprints: 1.0 = identical features, 0.0 = weighted difference of 1.0 or more.
+    pub fn similarity(&self, other: &VoiceFingerprint) -> f32 {
+        // Scale each absolute feature difference by a fixed per-feature constant
+        let pitch_diff = (self.avg_pitch - other.avg_pitch).abs() / 200.0; // Normalize by typical pitch range
+        let variance_diff = (self.pitch_variance - other.pitch_variance).abs() / 50.0;
+        let energy_diff = (self.avg_energy - other.avg_energy).abs() / 0.5;
+        let centroid_diff = (self.spectral_centroid - other.spectral_centroid).abs() / 1000.0;
+
+        // Weighted sum of the differences (pitch 0.4, the other three 0.2 each), capped at 1.0 and inverted
+        let diff = pitch_diff * 0.4 + variance_diff * 0.2 + energy_diff * 0.2 + centroid_diff * 0.2;
+        (1.0 - diff.min(1.0)).max(0.0)
+    }
+}
+
+/// A detected speech segment with timing information and voice characteristics.
 #[derive(Debug, Clone)]
 pub struct SpeechSegment {
     pub start_sample: usize,
     pub end_sample: usize,
     pub speaker_id: u32,
+    pub fingerprint: Option<VoiceFingerprint>,
 }
 
 impl SpeechSegment {
@@ -45,19 +76,27 @@ impl SpeechSegment {
     }
 }
 
-/// Voice Activity Detector for speaker separation.
+/// Voice Activity Detector for speaker separation with enhanced tracking.
 pub struct SpeakerSeparator {
     vad: VoiceActivityDetector,
     /// Minimum silence duration (in seconds) to consider a speaker change
-    min_silence_for_speaker_change: f64,
+    _min_silence_for_speaker_change: f64,
     /// Minimum speech duration (in seconds) to consider a valid segment
     min_speech_duration: f64,
+    /// Known speakers and their voice fingerprints
+    known_speakers: HashMap<u32, VoiceFingerprint>,
+    /// Next speaker ID to assign
+    next_speaker_id: u32,
+    /// Similarity threshold for matching speakers (0.0 to 1.0)
+    similarity_threshold: f32,
+    /// FFT planner for frequency analysis
+    fft_planner: FftPlanner<f32>,
 }
 
 impl SpeakerSeparator {
     /// Create a new speaker separator with default settings.
     pub fn new() -> Result<Self, VadError> {
-        Self::with_settings(1.5, 0.3)
+        Self::with_settings(0.8, 0.3) // Reduced silence threshold for group meetings
     }
 
     /// Create a new speaker separator with custom settings.
@@ -75,87 +114,212 @@ impl SpeakerSeparator {
             .build()
             .map_err(|e: voice_activity_detector::Error| VadError::InitError(e.to_string()))?;
 
-        info!("VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s",
+        info!("Enhanced VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s",
               VAD_SAMPLE_RATE, min_silence_for_speaker_change, min_speech_duration);
 
         Ok(Self {
             vad,
-            min_silence_for_speaker_change,
+            _min_silence_for_speaker_change: min_silence_for_speaker_change,
             min_speech_duration,
+            known_speakers: HashMap::new(),
+            next_speaker_id: 0,
+            similarity_threshold: 0.7,
+            fft_planner: FftPlanner::new(),
         })
     }
 
-    /// Detect speech segments and assign speaker IDs based on silence gaps.
+    /// Extract a voice fingerprint from an audio segment; returns `None` for fewer than 2048 samples.
+    fn extract_fingerprint(&mut self, audio: &[f32]) -> Option<VoiceFingerprint> {
+        if audio.len() < 2048 {
+            return None;
+        }
+
+        // Average energy, taken as the mean absolute sample value
+        let avg_energy = audio.iter().map(|s| s.abs()).sum::<f32>() / audio.len() as f32;
+
+        // Estimate pitch from the zero-crossing rate (simplified; this is not autocorrelation)
+        let pitch = self.estimate_pitch(audio);
+
+        // Calculate spectral centroid
+        let spectral_centroid = self.calculate_spectral_centroid(audio);
+
+        // Spread of the per-window pitch estimates (stored in the `pitch_variance` field)
+        let pitch_variance = self.calculate_pitch_variance(audio);
+
+        Some(VoiceFingerprint {
+            avg_pitch: pitch,
+            pitch_variance,
+            avg_energy,
+            spectral_centroid,
+        })
+    }
+
+    /// Estimate pitch using zero-crossing rate (simplified approach).
+    ///
+    /// Counts sign changes between neighbouring samples and converts that
+    /// rate into an approximate frequency in Hz. Returns 0.0 for an empty
+    /// slice instead of panicking on a first-sample lookup.
+    fn estimate_pitch(&self, audio: &[f32]) -> f32 {
+        if audio.is_empty() {
+            return 0.0;
+        }
+        let zero_crossings = audio
+            .windows(2)
+            .filter(|pair| (pair[0] >= 0.0) != (pair[1] >= 0.0))
+            .count();
+
+        // A periodic signal crosses zero twice per cycle, hence the division by two.
+        zero_crossings as f32 / audio.len() as f32 * VAD_SAMPLE_RATE as f32 / 2.0
+    }
+
+    /// Calculate spectral centroid (magnitude-weighted mean frequency, in Hz) of at most the first 2048 samples.
+    fn calculate_spectral_centroid(&mut self, audio: &[f32]) -> f32 {
+        let fft_size = 2048.min(audio.len());
+        let mut input: Vec<Complex<f32>> = audio[..fft_size]
+            .iter()
+            .map(|&s| Complex::new(s, 0.0))
+            .collect();
+
+        let fft = self.fft_planner.plan_fft_forward(fft_size);
+        fft.process(&mut input);
+
+        let mut weighted_sum = 0.0;
+        let mut magnitude_sum = 0.0;
+
+        for (i, complex) in input.iter().enumerate().take(fft_size / 2) { // bins below half the sample rate
+            let magnitude = complex.norm();
+            let frequency = i as f32 * VAD_SAMPLE_RATE as f32 / fft_size as f32;
+            weighted_sum += frequency * magnitude;
+            magnitude_sum += magnitude;
+        }
+
+        if magnitude_sum > 0.0 {
+            weighted_sum / magnitude_sum
+        } else {
+            0.0
+        }
+    }
+
+    /// Calculate pitch spread across 1024-sample windows; returns the standard deviation (0.0 for fewer than 2 windows).
+    fn calculate_pitch_variance(&self, audio: &[f32]) -> f32 {
+        let window_size = 1024;
+        let num_windows = audio.len() / window_size;
+
+        if num_windows < 2 {
+            return 0.0;
+        }
+
+        let mut pitches = Vec::new();
+        for i in 0..num_windows {
+            let start = i * window_size;
+            let end = (i + 1) * window_size;
+            let pitch = self.estimate_pitch(&audio[start..end]);
+            pitches.push(pitch);
+        }
+
+        // Population variance of the window pitches; its square root is what gets returned
+        let mean = pitches.iter().sum::<f32>() / pitches.len() as f32;
+        let variance = pitches.iter()
+            .map(|p| (p - mean).powi(2))
+            .sum::<f32>() / pitches.len() as f32;
+
+        variance.sqrt()
+    }
+
+    /// Return the ID of the most similar known speaker at or above `similarity_threshold`, or register a new speaker.
+    fn find_or_assign_speaker(&mut self, fingerprint: &VoiceFingerprint) -> u32 {
+        // Scan every known speaker, keeping the highest similarity that also meets the threshold
+        let mut best_match = None;
+        let mut best_similarity = 0.0;
+
+        for (speaker_id, known_fp) in &self.known_speakers {
+            let similarity = fingerprint.similarity(known_fp);
+            if similarity > best_similarity && similarity >= self.similarity_threshold {
+                best_similarity = similarity;
+                best_match = Some(*speaker_id);
+            }
+        }
+
+        if let Some(speaker_id) = best_match {
+            debug!("Matched to existing speaker {} with similarity {:.2}", speaker_id, best_similarity);
+            speaker_id
+        } else {
+            // No match: store this fingerprint under the next free ID (stored fingerprints are not updated on later matches)
+            let speaker_id = self.next_speaker_id;
+            self.next_speaker_id += 1;
+            self.known_speakers.insert(speaker_id, fingerprint.clone());
+            debug!("New speaker {} detected", speaker_id);
+            speaker_id
+        }
+    }
+
+    /// Detect speech segments and assign speaker IDs based on voice characteristics.
     ///
-    /// This is a simple heuristic: if there's a long enough silence gap,
-    /// we assume a different speaker might be talking.
+    /// This enhanced version uses voice fingerprinting to track speakers across
+    /// overlapping speech and short interruptions, making it suitable for group meetings.
     pub fn detect_speakers(&mut self, audio: &[f32]) -> Result<Vec<SpeechSegment>, VadError> {
         let chunk_size = 512;
-        let mut segments = Vec::new();
+        let mut raw_segments = Vec::new();
         let mut current_segment_start: Option<usize> = None;
-        let mut last_speech_end: Option<usize> = None;
-        let mut current_speaker_id = 0u32;
-
-        let min_silence_samples = (self.min_silence_for_speaker_change * VAD_SAMPLE_RATE as f64) as usize;
         let min_speech_samples = (self.min_speech_duration * VAD_SAMPLE_RATE as f64) as usize;
 
-        debug!("Processing {} samples for VAD", audio.len());
+        debug!("Processing {} samples for enhanced VAD", audio.len());
 
+        // First pass: detect raw speech segments
         for (chunk_idx, chunk) in audio.chunks(chunk_size).enumerate() {
             if chunk.len() < chunk_size {
-                // Skip incomplete chunks at the end
                 continue;
             }
 
             let sample_offset = chunk_idx * chunk_size;
-
-            // Predict voice activity for this chunk
             let probability = self.vad.predict(chunk.iter().copied());
-
             let is_speech = probability > 0.5;
 
             if is_speech {
                 if current_segment_start.is_none() {
-                    // Check if we should change speaker
-                    if let Some(last_end) = last_speech_end {
-                        let silence_duration = sample_offset - last_end;
-                        if silence_duration >= min_silence_samples {
-                            current_speaker_id += 1;
-                            debug!("Speaker change detected at sample {} (silence: {}ms)",
-                                   sample_offset, silence_duration * 1000 / VAD_SAMPLE_RATE as usize);
-                        }
-                    }
                     current_segment_start = Some(sample_offset);
                 }
             } else if let Some(start) = current_segment_start {
-                // Speech ended
                 let segment_duration = sample_offset - start;
-
                 if segment_duration >= min_speech_samples {
-                    segments.push(SpeechSegment {
-                        start_sample: start,
-                        end_sample: sample_offset,
-                        speaker_id: current_speaker_id,
-                    });
+                    raw_segments.push((start, sample_offset));
                 }
-
-                last_speech_end = Some(sample_offset);
                 current_segment_start = None;
             }
         }
 
-        // Handle case where speech continues to the end
+        // Handle speech continuing to the end
         if let Some(start) = current_segment_start {
             let segment_duration = audio.len() - start;
             if segment_duration >= min_speech_samples {
-                segments.push(SpeechSegment {
-                    start_sample: start,
-                    end_sample: audio.len(),
-                    speaker_id: current_speaker_id,
-                });
+                raw_segments.push((start, audio.len()));
             }
         }
 
+        // Second pass: analyze voice characteristics and assign speakers
+        let mut segments = Vec::new();
+        for (start, end) in raw_segments {
+            let segment_audio = &audio[start..end];
+
+            // Extract voice fingerprint (None when the segment is too short to analyze)
+            let fingerprint = self.extract_fingerprint(segment_audio);
+
+            let speaker_id = if let Some(ref fp) = fingerprint {
+                self.find_or_assign_speaker(fp)
+            } else {
+                // Fallback: reuse the most recently created speaker ID, or 0 if none exists yet
+                warn!("Could not extract fingerprint for segment at {}s", start as f64 / VAD_SAMPLE_RATE as f64);
+                self.next_speaker_id.saturating_sub(1) // plain `- 1` underflows u32 when the counter is still 0
+            };
+
+            segments.push(SpeechSegment {
+                start_sample: start,
+                end_sample: end,
+                speaker_id,
+                fingerprint,
+            });
+        }
+
         info!("Detected {} speech segments with {} speakers",
               segments.len(),
               segments.iter().map(|s| s.speaker_id).max().unwrap_or(0) + 1);
@@ -188,6 +352,51 @@ impl SpeakerSeparator {
 
         Ok(transcript)
     }
+
+    /// Apply speaker labels to a chunk of transcript segments (for real-time processing).
+    ///
+    /// Runs speaker detection on `audio`, then labels each transcript segment whose midpoint lies
+    /// inside a detected speech segment. `_chunk_start_time` is accepted but currently unused.
+    pub fn apply_speaker_labels_to_chunk(
+        &mut self,
+        audio: &[f32],
+        mut transcript: Vec<TranscriptSegment>,
+        _chunk_start_time: f64,
+    ) -> Result<Vec<TranscriptSegment>, VadError> {
+        // Detect speakers in this chunk
+        let speech_segments = self.detect_speakers(audio)?;
+
+        for segment in &mut transcript {
+            // Segment midpoint, compared as-is (no chunk offset is applied) with the speech segment bounds
+            let segment_mid = (segment.start + segment.end) / 2.0;
+
+            // Find matching speech segment
+            for speech in &speech_segments {
+                let speech_start = speech.start_seconds();
+                let speech_end = speech.end_seconds();
+
+                if segment_mid >= speech_start && segment_mid <= speech_end {
+                    // Labels are 1-based: internal speaker ID 0 is shown as "Speaker 1"
+                    segment.speaker = format!("Speaker {}", speech.speaker_id + 1);
+                    break;
+                }
+            }
+        }
+
+        // No speakers are evicted here; this only warns once more than 10 are being tracked
+        if self.known_speakers.len() > 10 {
+            warn!("Many speakers detected ({}), consider adjusting similarity threshold", self.known_speakers.len());
+        }
+
+        Ok(transcript)
+    }
+
+    /// Forget all known speaker fingerprints and restart ID numbering at 0 (useful between different meetings).
+    pub fn reset_speakers(&mut self) {
+        self.known_speakers.clear();
+        self.next_speaker_id = 0;
+        info!("Speaker tracking reset");
+    }
 }
 
 impl Default for SpeakerSeparator {
diff --git a/src/App.tsx b/src/App.tsx
index 5ea4e88..fc7b583 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -202,8 +202,10 @@ function App() {
       // If we have enough audio (at least 5 seconds worth at 16kHz)
       if (audioChunk.length >= 5 * 16000) {
         // Transcribe the chunk
+        const chunkStartTime = totalProcessedSamples.current / 16000;
         const newSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
-          audioData: audioChunk
+          audioData: audioChunk,
+          chunkStartTime: chunkStartTime
         });
 
         if (newSegments.length > 0) {
@@ -293,9 +295,11 @@ function App() {
 
         if (finalChunk.length > 0) {
           console.log(`Processing final chunk of ${finalChunk.length} samples`);
+          const chunkStartTime = totalProcessedSamples.current / 16000;
           // The progress will be updated via events from the backend
           const finalSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
-            audioData: finalChunk
+            audioData: finalChunk,
+            chunkStartTime: chunkStartTime
           });
 
           if (finalSegments.length > 0) {