// generated from nhcarrigan/template
// feat: installer working for Windows; models are downloaded at runtime instead of at build time.
//! Voice Activity Detection for basic speaker separation.
//!
//! This module uses the Silero VAD model to detect speech segments
//! and provides basic speaker separation based on silence gaps.

use thiserror::Error;
use tracing::{debug, info};
use voice_activity_detector::VoiceActivityDetector;

use super::transcriber::TranscriptSegment;

/// Sample rate expected by the VAD model (16kHz)
pub const VAD_SAMPLE_RATE: u32 = 16000;
/// Errors that can occur while creating or running the VAD.
#[derive(Error, Debug)]
pub enum VadError {
    /// The underlying VAD model could not be constructed.
    #[error("Failed to create VAD: {0}")]
    InitError(String),
    /// The VAD failed while processing audio samples.
    #[error("VAD processing failed: {0}")]
    ProcessingError(String),
}
/// A detected speech segment with timing information.
#[derive(Debug, Clone)]
pub struct SpeechSegment {
    /// Index of the first audio sample of the segment (at `VAD_SAMPLE_RATE`).
    pub start_sample: usize,
    /// Exclusive end: index one past the last audio sample of the segment.
    pub end_sample: usize,
    /// Zero-based speaker label assigned heuristically from silence gaps.
    pub speaker_id: u32,
}
impl SpeechSegment {
|
||||
/// Get the start time in seconds.
|
||||
pub fn start_seconds(&self) -> f64 {
|
||||
self.start_sample as f64 / VAD_SAMPLE_RATE as f64
|
||||
}
|
||||
|
||||
/// Get the end time in seconds.
|
||||
pub fn end_seconds(&self) -> f64 {
|
||||
self.end_sample as f64 / VAD_SAMPLE_RATE as f64
|
||||
}
|
||||
|
||||
/// Get the duration in seconds.
|
||||
pub fn duration_seconds(&self) -> f64 {
|
||||
self.end_seconds() - self.start_seconds()
|
||||
}
|
||||
}
|
||||
|
||||
/// Voice Activity Detector for speaker separation.
pub struct SpeakerSeparator {
    /// Silero VAD instance used to score each audio chunk for speech.
    vad: VoiceActivityDetector,
    /// Minimum silence duration (in seconds) to consider a speaker change
    min_silence_for_speaker_change: f64,
    /// Minimum speech duration (in seconds) to consider a valid segment
    min_speech_duration: f64,
}
impl SpeakerSeparator {
|
||||
/// Create a new speaker separator with default settings.
|
||||
pub fn new() -> Result<Self, VadError> {
|
||||
Self::with_settings(1.5, 0.3)
|
||||
}
|
||||
|
||||
/// Create a new speaker separator with custom settings.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `min_silence_for_speaker_change` - Minimum silence duration (seconds) to consider a speaker change
|
||||
/// * `min_speech_duration` - Minimum speech duration (seconds) to consider a valid segment
|
||||
pub fn with_settings(
|
||||
min_silence_for_speaker_change: f64,
|
||||
min_speech_duration: f64,
|
||||
) -> Result<Self, VadError> {
|
||||
let vad = VoiceActivityDetector::builder()
|
||||
.sample_rate(VAD_SAMPLE_RATE)
|
||||
.chunk_size(512usize) // 512 samples for 16kHz
|
||||
.build()
|
||||
.map_err(|e: voice_activity_detector::Error| VadError::InitError(e.to_string()))?;
|
||||
|
||||
info!("VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s",
|
||||
VAD_SAMPLE_RATE, min_silence_for_speaker_change, min_speech_duration);
|
||||
|
||||
Ok(Self {
|
||||
vad,
|
||||
min_silence_for_speaker_change,
|
||||
min_speech_duration,
|
||||
})
|
||||
}
|
||||
|
||||
/// Detect speech segments and assign speaker IDs based on silence gaps.
|
||||
///
|
||||
/// This is a simple heuristic: if there's a long enough silence gap,
|
||||
/// we assume a different speaker might be talking.
|
||||
pub fn detect_speakers(&mut self, audio: &[f32]) -> Result<Vec<SpeechSegment>, VadError> {
|
||||
let chunk_size = 512;
|
||||
let mut segments = Vec::new();
|
||||
let mut current_segment_start: Option<usize> = None;
|
||||
let mut last_speech_end: Option<usize> = None;
|
||||
let mut current_speaker_id = 0u32;
|
||||
|
||||
let min_silence_samples = (self.min_silence_for_speaker_change * VAD_SAMPLE_RATE as f64) as usize;
|
||||
let min_speech_samples = (self.min_speech_duration * VAD_SAMPLE_RATE as f64) as usize;
|
||||
|
||||
debug!("Processing {} samples for VAD", audio.len());
|
||||
|
||||
for (chunk_idx, chunk) in audio.chunks(chunk_size).enumerate() {
|
||||
if chunk.len() < chunk_size {
|
||||
// Skip incomplete chunks at the end
|
||||
continue;
|
||||
}
|
||||
|
||||
let sample_offset = chunk_idx * chunk_size;
|
||||
|
||||
// Predict voice activity for this chunk
|
||||
let probability = self.vad.predict(chunk.iter().copied());
|
||||
|
||||
let is_speech = probability > 0.5;
|
||||
|
||||
if is_speech {
|
||||
if current_segment_start.is_none() {
|
||||
// Check if we should change speaker
|
||||
if let Some(last_end) = last_speech_end {
|
||||
let silence_duration = sample_offset - last_end;
|
||||
if silence_duration >= min_silence_samples {
|
||||
current_speaker_id += 1;
|
||||
debug!("Speaker change detected at sample {} (silence: {}ms)",
|
||||
sample_offset, silence_duration * 1000 / VAD_SAMPLE_RATE as usize);
|
||||
}
|
||||
}
|
||||
current_segment_start = Some(sample_offset);
|
||||
}
|
||||
} else if let Some(start) = current_segment_start {
|
||||
// Speech ended
|
||||
let segment_duration = sample_offset - start;
|
||||
|
||||
if segment_duration >= min_speech_samples {
|
||||
segments.push(SpeechSegment {
|
||||
start_sample: start,
|
||||
end_sample: sample_offset,
|
||||
speaker_id: current_speaker_id,
|
||||
});
|
||||
}
|
||||
|
||||
last_speech_end = Some(sample_offset);
|
||||
current_segment_start = None;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle case where speech continues to the end
|
||||
if let Some(start) = current_segment_start {
|
||||
let segment_duration = audio.len() - start;
|
||||
if segment_duration >= min_speech_samples {
|
||||
segments.push(SpeechSegment {
|
||||
start_sample: start,
|
||||
end_sample: audio.len(),
|
||||
speaker_id: current_speaker_id,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
info!("Detected {} speech segments with {} speakers",
|
||||
segments.len(),
|
||||
segments.iter().map(|s| s.speaker_id).max().unwrap_or(0) + 1);
|
||||
|
||||
Ok(segments)
|
||||
}
|
||||
|
||||
/// Apply speaker labels to transcript segments based on VAD results.
|
||||
pub fn apply_speaker_labels(
|
||||
&mut self,
|
||||
audio: &[f32],
|
||||
mut transcript: Vec<TranscriptSegment>,
|
||||
) -> Result<Vec<TranscriptSegment>, VadError> {
|
||||
let speech_segments = self.detect_speakers(audio)?;
|
||||
|
||||
for segment in &mut transcript {
|
||||
// Find the speech segment that overlaps with this transcript segment
|
||||
let segment_mid = (segment.start + segment.end) / 2.0;
|
||||
|
||||
for speech in &speech_segments {
|
||||
let speech_start = speech.start_seconds();
|
||||
let speech_end = speech.end_seconds();
|
||||
|
||||
if segment_mid >= speech_start && segment_mid <= speech_end {
|
||||
segment.speaker = format!("Speaker {}", speech.speaker_id + 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(transcript)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SpeakerSeparator {
    /// Build a separator with the default settings used by `SpeakerSeparator::new`.
    ///
    /// # Panics
    /// Panics if `SpeakerSeparator::new` returns an error (i.e. the VAD
    /// model could not be initialized).
    fn default() -> Self {
        Self::new().expect("Failed to create speaker separator")
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A one-second segment starting at 1s converts correctly to seconds.
    #[test]
    fn test_speech_segment_timing() {
        let segment = SpeechSegment {
            start_sample: 16000,
            end_sample: 32000,
            speaker_id: 0,
        };

        assert!((segment.start_seconds() - 1.0).abs() < 0.001);
        assert!((segment.end_seconds() - 2.0).abs() < 0.001);
        assert!((segment.duration_seconds() - 1.0).abs() < 0.001);
    }

    /// Edge case: a zero-length segment maps to zero duration at time zero.
    #[test]
    fn test_zero_length_segment() {
        let segment = SpeechSegment {
            start_sample: 0,
            end_sample: 0,
            speaker_id: 0,
        };

        assert_eq!(segment.start_seconds(), 0.0);
        assert_eq!(segment.end_seconds(), 0.0);
        assert_eq!(segment.duration_seconds(), 0.0);
    }
}