feat: the Windows installer now works!
Security Scan and Upload / Security & DefectDojo Upload (pull_request) Successful in 2m25s
CI / Lint & Test (pull_request) Failing after 6m21s
CI / Build Linux (pull_request) Has been skipped
CI / Build Windows (cross-compile) (pull_request) Has been skipped

Models are downloaded at runtime instead of at build time.
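A minimal sketch of what the runtime download path could look like (hypothetical names: MODEL_URL, ensure_model_downloaded; assumes the reqwest crate; the actual implementation in this commit may differ):

// Hedged sketch: fetch the model on first use instead of bundling it at build.
use std::{fs, io, path::{Path, PathBuf}};

// Placeholder URL; the real model location is project-specific.
const MODEL_URL: &str = "https://example.com/models/silero_vad.onnx";

fn ensure_model_downloaded(cache_dir: &Path) -> io::Result<PathBuf> {
    let model_path = cache_dir.join("silero_vad.onnx");
    if !model_path.exists() {
        fs::create_dir_all(cache_dir)?;
        // Assumes reqwest's blocking API is available.
        let bytes = reqwest::blocking::get(MODEL_URL)
            .and_then(|resp| resp.bytes())
            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
        fs::write(&model_path, &bytes)?;
    }
    Ok(model_path)
}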
2026-01-28 17:15:13 -08:00
parent 3c8a46e5a6
commit 74c334c939
684 changed files with 431984 additions and 192 deletions
+215
@@ -0,0 +1,215 @@
//! Voice Activity Detection for basic speaker separation.
//!
//! This module uses the Silero VAD model to detect speech segments
//! and provides basic speaker separation based on silence gaps.
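//!
//! A minimal usage sketch (an illustrative addition, assuming `audio` holds
//! 16 kHz mono `f32` samples):
//!
//! ```ignore
//! let mut separator = SpeakerSeparator::new()?;
//! let segments = separator.detect_speakers(&audio)?;
//! for s in &segments {
//!     println!("Speaker {}: {:.2}-{:.2}s", s.speaker_id + 1, s.start_seconds(), s.end_seconds());
//! }
//! ```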
use thiserror::Error;
use tracing::{debug, info};
use voice_activity_detector::VoiceActivityDetector;

use super::transcriber::TranscriptSegment;

/// Sample rate expected by the VAD model (16 kHz).
pub const VAD_SAMPLE_RATE: u32 = 16000;

#[derive(Error, Debug)]
pub enum VadError {
    #[error("Failed to create VAD: {0}")]
    InitError(String),
    #[error("VAD processing failed: {0}")]
    ProcessingError(String),
}

/// A detected speech segment with timing information.
#[derive(Debug, Clone)]
pub struct SpeechSegment {
    pub start_sample: usize,
    pub end_sample: usize,
    pub speaker_id: u32,
}

impl SpeechSegment {
    /// Get the start time in seconds.
    pub fn start_seconds(&self) -> f64 {
        self.start_sample as f64 / VAD_SAMPLE_RATE as f64
    }

    /// Get the end time in seconds.
    pub fn end_seconds(&self) -> f64 {
        self.end_sample as f64 / VAD_SAMPLE_RATE as f64
    }

    /// Get the duration in seconds.
    pub fn duration_seconds(&self) -> f64 {
        self.end_seconds() - self.start_seconds()
    }
}
/// Voice Activity Detector for speaker separation.
pub struct SpeakerSeparator {
    vad: VoiceActivityDetector,
    /// Minimum silence duration (in seconds) to consider a speaker change.
    min_silence_for_speaker_change: f64,
    /// Minimum speech duration (in seconds) to consider a valid segment.
    min_speech_duration: f64,
}

impl SpeakerSeparator {
    /// Create a new speaker separator with default settings:
    /// a 1.5 s silence gap triggers a speaker change, and segments
    /// shorter than 0.3 s are discarded.
    pub fn new() -> Result<Self, VadError> {
        Self::with_settings(1.5, 0.3)
    }

    /// Create a new speaker separator with custom settings.
    ///
    /// # Arguments
    /// * `min_silence_for_speaker_change` - Minimum silence duration (seconds) to consider a speaker change
    /// * `min_speech_duration` - Minimum speech duration (seconds) to consider a valid segment
    pub fn with_settings(
        min_silence_for_speaker_change: f64,
        min_speech_duration: f64,
    ) -> Result<Self, VadError> {
        let vad = VoiceActivityDetector::builder()
            .sample_rate(VAD_SAMPLE_RATE)
            .chunk_size(512usize) // Silero expects 512-sample chunks at 16 kHz
            .build()
            .map_err(|e: voice_activity_detector::Error| VadError::InitError(e.to_string()))?;

        info!(
            "VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s",
            VAD_SAMPLE_RATE, min_silence_for_speaker_change, min_speech_duration
        );

        Ok(Self {
            vad,
            min_silence_for_speaker_change,
            min_speech_duration,
        })
    }
    /// Detect speech segments and assign speaker IDs based on silence gaps.
    ///
    /// This is a simple heuristic: if there's a long enough silence gap,
    /// we assume a different speaker might be talking.
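    /// For example, with the defaults a gap of at least 1.5 s
    /// (1.5 * 16 000 = 24 000 samples) increments the speaker ID.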
    pub fn detect_speakers(&mut self, audio: &[f32]) -> Result<Vec<SpeechSegment>, VadError> {
        // Must match the chunk_size given to the VAD builder.
        let chunk_size = 512;
        let mut segments = Vec::new();
        let mut current_segment_start: Option<usize> = None;
        let mut last_speech_end: Option<usize> = None;
        let mut current_speaker_id = 0u32;

        let min_silence_samples =
            (self.min_silence_for_speaker_change * VAD_SAMPLE_RATE as f64) as usize;
        let min_speech_samples = (self.min_speech_duration * VAD_SAMPLE_RATE as f64) as usize;

        debug!("Processing {} samples for VAD", audio.len());

        for (chunk_idx, chunk) in audio.chunks(chunk_size).enumerate() {
            if chunk.len() < chunk_size {
                // Skip the incomplete chunk at the end, if any.
                continue;
            }
            let sample_offset = chunk_idx * chunk_size;

            // Predict voice activity for this chunk.
            let probability = self.vad.predict(chunk.iter().copied());
            let is_speech = probability > 0.5;

            if is_speech {
                if current_segment_start.is_none() {
                    // Check if we should change speaker.
                    if let Some(last_end) = last_speech_end {
                        let silence_samples = sample_offset - last_end;
                        if silence_samples >= min_silence_samples {
                            current_speaker_id += 1;
                            debug!(
                                "Speaker change detected at sample {} (silence: {}ms)",
                                sample_offset,
                                silence_samples * 1000 / VAD_SAMPLE_RATE as usize
                            );
                        }
                    }
                    current_segment_start = Some(sample_offset);
                }
            } else if let Some(start) = current_segment_start {
                // Speech ended; keep the segment only if it is long enough.
                let segment_duration = sample_offset - start;
                if segment_duration >= min_speech_samples {
                    segments.push(SpeechSegment {
                        start_sample: start,
                        end_sample: sample_offset,
                        speaker_id: current_speaker_id,
                    });
                }
                last_speech_end = Some(sample_offset);
                current_segment_start = None;
            }
        }

        // Handle the case where speech continues to the end of the audio.
        if let Some(start) = current_segment_start {
            let segment_duration = audio.len() - start;
            if segment_duration >= min_speech_samples {
                segments.push(SpeechSegment {
                    start_sample: start,
                    end_sample: audio.len(),
                    speaker_id: current_speaker_id,
                });
            }
        }

        info!(
            "Detected {} speech segments with {} speakers",
            segments.len(),
            // max() is None when no segments were found; report 0 speakers then.
            segments.iter().map(|s| s.speaker_id).max().map_or(0, |id| id + 1)
        );

        Ok(segments)
    }
    /// Apply speaker labels to transcript segments based on VAD results.
    pub fn apply_speaker_labels(
        &mut self,
        audio: &[f32],
        mut transcript: Vec<TranscriptSegment>,
    ) -> Result<Vec<TranscriptSegment>, VadError> {
        let speech_segments = self.detect_speakers(audio)?;

        for segment in &mut transcript {
            // Find the speech segment whose span contains this transcript
            // segment's midpoint.
            let segment_mid = (segment.start + segment.end) / 2.0;
            for speech in &speech_segments {
                let speech_start = speech.start_seconds();
                let speech_end = speech.end_seconds();
                if segment_mid >= speech_start && segment_mid <= speech_end {
                    segment.speaker = format!("Speaker {}", speech.speaker_id + 1);
                    break;
                }
            }
        }

        Ok(transcript)
    }
}
impl Default for SpeakerSeparator {
    fn default() -> Self {
        Self::new().expect("Failed to create speaker separator")
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_speech_segment_timing() {
        let segment = SpeechSegment {
            start_sample: 16000,
            end_sample: 32000,
            speaker_id: 0,
        };
        assert!((segment.start_seconds() - 1.0).abs() < 0.001);
        assert!((segment.end_seconds() - 2.0).abs() < 0.001);
        assert!((segment.duration_seconds() - 1.0).abs() < 0.001);
    }
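
    // A hedged addition, not in the original file: it assumes the bundled
    // Silero model assigns low speech probability (< 0.5) to pure silence,
    // so an all-zero buffer should produce no segments.
    #[test]
    fn test_silence_yields_no_segments() {
        let mut separator = SpeakerSeparator::new().expect("VAD should initialize");
        let silence = vec![0.0f32; VAD_SAMPLE_RATE as usize]; // 1 second of silence
        let segments = separator.detect_speakers(&silence).unwrap();
        assert!(segments.is_empty());
    }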
}