// generated from nhcarrigan/template
// feat: installer working for Windows; models are downloaded at runtime instead of at build time.
//! Voice Activity Detection for basic speaker separation.
//!
//! This module uses the Silero VAD model to detect speech segments
//! and provides basic speaker separation based on silence gaps.

use thiserror::Error;
use tracing::{debug, info};
use voice_activity_detector::VoiceActivityDetector;

use super::transcriber::TranscriptSegment;

/// Sample rate expected by the VAD model (16kHz)
pub const VAD_SAMPLE_RATE: u32 = 16000;
/// Errors that can occur while creating or running the VAD.
#[derive(Error, Debug)]
pub enum VadError {
    /// The underlying VAD model could not be constructed.
    #[error("Failed to create VAD: {0}")]
    InitError(String),
    /// The VAD failed while processing audio samples.
    #[error("VAD processing failed: {0}")]
    ProcessingError(String),
}
/// A detected speech segment with timing information.
#[derive(Debug, Clone)]
pub struct SpeechSegment {
    /// Index of the first audio sample of the segment (at `VAD_SAMPLE_RATE`).
    pub start_sample: usize,
    /// Exclusive end: index one past the last audio sample of the segment.
    pub end_sample: usize,
    /// Zero-based speaker label assigned heuristically from silence gaps.
    pub speaker_id: u32,
}
impl SpeechSegment {
|
||||
/// Get the start time in seconds.
|
||||
pub fn start_seconds(&self) -> f64 {
|
||||
self.start_sample as f64 / VAD_SAMPLE_RATE as f64
|
||||
}
|
||||
|
||||
/// Get the end time in seconds.
|
||||
pub fn end_seconds(&self) -> f64 {
|
||||
self.end_sample as f64 / VAD_SAMPLE_RATE as f64
|
||||
}
|
||||
|
||||
/// Get the duration in seconds.
|
||||
pub fn duration_seconds(&self) -> f64 {
|
||||
self.end_seconds() - self.start_seconds()
|
||||
}
|
||||
}
|
||||
|
||||
/// Voice Activity Detector for speaker separation.
pub struct SpeakerSeparator {
    /// Silero VAD instance used to score each audio chunk for speech.
    vad: VoiceActivityDetector,
    /// Minimum silence duration (in seconds) to consider a speaker change
    min_silence_for_speaker_change: f64,
    /// Minimum speech duration (in seconds) to consider a valid segment
    min_speech_duration: f64,
}
impl SpeakerSeparator {
|
||||
/// Create a new speaker separator with default settings.
|
||||
pub fn new() -> Result<Self, VadError> {
|
||||
Self::with_settings(1.5, 0.3)
|
||||
}
|
||||
|
||||
/// Create a new speaker separator with custom settings.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `min_silence_for_speaker_change` - Minimum silence duration (seconds) to consider a speaker change
|
||||
/// * `min_speech_duration` - Minimum speech duration (seconds) to consider a valid segment
|
||||
pub fn with_settings(
|
||||
min_silence_for_speaker_change: f64,
|
||||
min_speech_duration: f64,
|
||||
) -> Result<Self, VadError> {
|
||||
let vad = VoiceActivityDetector::builder()
|
||||
.sample_rate(VAD_SAMPLE_RATE)
|
||||
.chunk_size(512usize) // 512 samples for 16kHz
|
||||
.build()
|
||||
.map_err(|e: voice_activity_detector::Error| VadError::InitError(e.to_string()))?;
|
||||
|
||||
info!("VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s",
|
||||
VAD_SAMPLE_RATE, min_silence_for_speaker_change, min_speech_duration);
|
||||
|
||||
Ok(Self {
|
||||
vad,
|
||||
min_silence_for_speaker_change,
|
||||
min_speech_duration,
|
||||
})
|
||||
}
|
||||
|
||||
/// Detect speech segments and assign speaker IDs based on silence gaps.
|
||||
///
|
||||
/// This is a simple heuristic: if there's a long enough silence gap,
|
||||
/// we assume a different speaker might be talking.
|
||||
pub fn detect_speakers(&mut self, audio: &[f32]) -> Result<Vec<SpeechSegment>, VadError> {
|
||||
let chunk_size = 512;
|
||||
let mut segments = Vec::new();
|
||||
let mut current_segment_start: Option<usize> = None;
|
||||
let mut last_speech_end: Option<usize> = None;
|
||||
let mut current_speaker_id = 0u32;
|
||||
|
||||
let min_silence_samples = (self.min_silence_for_speaker_change * VAD_SAMPLE_RATE as f64) as usize;
|
||||
let min_speech_samples = (self.min_speech_duration * VAD_SAMPLE_RATE as f64) as usize;
|
||||
|
||||
debug!("Processing {} samples for VAD", audio.len());
|
||||
|
||||
for (chunk_idx, chunk) in audio.chunks(chunk_size).enumerate() {
|
||||
if chunk.len() < chunk_size {
|
||||
// Skip incomplete chunks at the end
|
||||
continue;
|
||||
}
|
||||
|
||||
let sample_offset = chunk_idx * chunk_size;
|
||||
|
||||
// Predict voice activity for this chunk
|
||||
let probability = self.vad.predict(chunk.iter().copied());
|
||||
|
||||
let is_speech = probability > 0.5;
|
||||
|
||||
if is_speech {
|
||||
if current_segment_start.is_none() {
|
||||
// Check if we should change speaker
|
||||
if let Some(last_end) = last_speech_end {
|
||||
let silence_duration = sample_offset - last_end;
|
||||
if silence_duration >= min_silence_samples {
|
||||
current_speaker_id += 1;
|
||||
debug!("Speaker change detected at sample {} (silence: {}ms)",
|
||||
sample_offset, silence_duration * 1000 / VAD_SAMPLE_RATE as usize);
|
||||
}
|
||||
}
|
||||
current_segment_start = Some(sample_offset);
|
||||
}
|
||||
} else if let Some(start) = current_segment_start {
|
||||
// Speech ended
|
||||
let segment_duration = sample_offset - start;
|
||||
|
||||
if segment_duration >= min_speech_samples {
|
||||
segments.push(SpeechSegment {
|
||||
start_sample: start,
|
||||
end_sample: sample_offset,
|
||||
speaker_id: current_speaker_id,
|
||||
});
|
||||
}
|
||||
|
||||
last_speech_end = Some(sample_offset);
|
||||
current_segment_start = None;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle case where speech continues to the end
|
||||
if let Some(start) = current_segment_start {
|
||||
let segment_duration = audio.len() - start;
|
||||
if segment_duration >= min_speech_samples {
|
||||
segments.push(SpeechSegment {
|
||||
start_sample: start,
|
||||
end_sample: audio.len(),
|
||||
speaker_id: current_speaker_id,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
info!("Detected {} speech segments with {} speakers",
|
||||
segments.len(),
|
||||
segments.iter().map(|s| s.speaker_id).max().unwrap_or(0) + 1);
|
||||
|
||||
Ok(segments)
|
||||
}
|
||||
|
||||
/// Apply speaker labels to transcript segments based on VAD results.
|
||||
pub fn apply_speaker_labels(
|
||||
&mut self,
|
||||
audio: &[f32],
|
||||
mut transcript: Vec<TranscriptSegment>,
|
||||
) -> Result<Vec<TranscriptSegment>, VadError> {
|
||||
let speech_segments = self.detect_speakers(audio)?;
|
||||
|
||||
for segment in &mut transcript {
|
||||
// Find the speech segment that overlaps with this transcript segment
|
||||
let segment_mid = (segment.start + segment.end) / 2.0;
|
||||
|
||||
for speech in &speech_segments {
|
||||
let speech_start = speech.start_seconds();
|
||||
let speech_end = speech.end_seconds();
|
||||
|
||||
if segment_mid >= speech_start && segment_mid <= speech_end {
|
||||
segment.speaker = format!("Speaker {}", speech.speaker_id + 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(transcript)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SpeakerSeparator {
    /// Build a separator with the default settings used by `SpeakerSeparator::new`.
    ///
    /// # Panics
    /// Panics if `SpeakerSeparator::new` returns an error (i.e. the VAD
    /// model could not be initialized).
    fn default() -> Self {
        Self::new().expect("Failed to create speaker separator")
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A one-second segment starting at 1s converts correctly to seconds.
    #[test]
    fn test_speech_segment_timing() {
        let segment = SpeechSegment {
            start_sample: 16000,
            end_sample: 32000,
            speaker_id: 0,
        };

        assert!((segment.start_seconds() - 1.0).abs() < 0.001);
        assert!((segment.end_seconds() - 2.0).abs() < 0.001);
        assert!((segment.duration_seconds() - 1.0).abs() < 0.001);
    }

    /// Edge case: a zero-length segment maps to zero duration at time zero.
    #[test]
    fn test_zero_length_segment() {
        let segment = SpeechSegment {
            start_sample: 0,
            end_sample: 0,
            speaker_id: 0,
        };

        assert_eq!(segment.start_seconds(), 0.0);
        assert_eq!(segment.end_seconds(), 0.0);
        assert_eq!(segment.duration_seconds(), 0.0);
    }
}