generated from nhcarrigan/template
feat: diarisation maybe
This commit is contained in:
Generated
+155
-1
@@ -519,11 +519,13 @@ name = "chronara"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cpal",
|
"cpal",
|
||||||
|
"dasp",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"hound",
|
"hound",
|
||||||
"llama-cpp-2",
|
"llama-cpp-2",
|
||||||
"parking_lot",
|
"parking_lot",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
"rustfft",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tauri",
|
"tauri",
|
||||||
@@ -813,12 +815,125 @@ dependencies = [
|
|||||||
"syn 2.0.114",
|
"syn 2.0.114",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7381b67da416b639690ac77c73b86a7b5e64a29e31d1f75fb3b1102301ef355a"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_envelope",
|
||||||
|
"dasp_frame",
|
||||||
|
"dasp_interpolate",
|
||||||
|
"dasp_peak",
|
||||||
|
"dasp_ring_buffer",
|
||||||
|
"dasp_rms",
|
||||||
|
"dasp_sample",
|
||||||
|
"dasp_signal",
|
||||||
|
"dasp_slice",
|
||||||
|
"dasp_window",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_envelope"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8ec617ce7016f101a87fe85ed44180839744265fae73bb4aa43e7ece1b7668b6"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_frame",
|
||||||
|
"dasp_peak",
|
||||||
|
"dasp_ring_buffer",
|
||||||
|
"dasp_rms",
|
||||||
|
"dasp_sample",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_frame"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b2a3937f5fe2135702897535c8d4a5553f8b116f76c1529088797f2eee7c5cd6"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_sample",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_interpolate"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7fc975a6563bb7ca7ec0a6c784ead49983a21c24835b0bc96eea11ee407c7486"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_frame",
|
||||||
|
"dasp_ring_buffer",
|
||||||
|
"dasp_sample",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_peak"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5cf88559d79c21f3d8523d91250c397f9a15b5fc72fbb3f87fdb0a37b79915bf"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_frame",
|
||||||
|
"dasp_sample",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_ring_buffer"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "07d79e19b89618a543c4adec9c5a347fe378a19041699b3278e616e387511ea1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_rms"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a6c5dcb30b7e5014486e2822537ea2beae50b19722ffe2ed7549ab03774575aa"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_frame",
|
||||||
|
"dasp_ring_buffer",
|
||||||
|
"dasp_sample",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dasp_sample"
|
name = "dasp_sample"
|
||||||
version = "0.11.0"
|
version = "0.11.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0c87e182de0887fd5361989c677c4e8f5000cd9491d6d563161a8f3a5519fc7f"
|
checksum = "0c87e182de0887fd5361989c677c4e8f5000cd9491d6d563161a8f3a5519fc7f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_signal"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "aa1ab7d01689c6ed4eae3d38fe1cea08cba761573fbd2d592528d55b421077e7"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_envelope",
|
||||||
|
"dasp_frame",
|
||||||
|
"dasp_interpolate",
|
||||||
|
"dasp_peak",
|
||||||
|
"dasp_ring_buffer",
|
||||||
|
"dasp_rms",
|
||||||
|
"dasp_sample",
|
||||||
|
"dasp_window",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_slice"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4e1c7335d58e7baedafa516cb361360ff38d6f4d3f9d9d5ee2a2fc8e27178fa1"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_frame",
|
||||||
|
"dasp_sample",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dasp_window"
|
||||||
|
version = "0.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "99ded7b88821d2ce4e8b842c9f1c86ac911891ab89443cc1de750cae764c5076"
|
||||||
|
dependencies = [
|
||||||
|
"dasp_sample",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "der"
|
name = "der"
|
||||||
version = "0.7.10"
|
version = "0.7.10"
|
||||||
@@ -3222,6 +3337,15 @@ dependencies = [
|
|||||||
"syn 2.0.114",
|
"syn 2.0.114",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "primal-check"
|
||||||
|
version = "0.3.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08"
|
||||||
|
dependencies = [
|
||||||
|
"num-integer",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro-crate"
|
name = "proc-macro-crate"
|
||||||
version = "1.3.1"
|
version = "1.3.1"
|
||||||
@@ -3351,7 +3475,7 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
"socket2",
|
"socket2",
|
||||||
"tracing",
|
"tracing",
|
||||||
"windows-sys 0.59.0",
|
"windows-sys 0.60.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3639,6 +3763,20 @@ dependencies = [
|
|||||||
"semver",
|
"semver",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustfft"
|
||||||
|
version = "6.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89"
|
||||||
|
dependencies = [
|
||||||
|
"num-complex",
|
||||||
|
"num-integer",
|
||||||
|
"num-traits",
|
||||||
|
"primal-check",
|
||||||
|
"strength_reduce",
|
||||||
|
"transpose",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustix"
|
name = "rustix"
|
||||||
version = "1.1.3"
|
version = "1.1.3"
|
||||||
@@ -4142,6 +4280,12 @@ version = "1.2.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
|
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strength_reduce"
|
||||||
|
version = "0.2.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "string_cache"
|
name = "string_cache"
|
||||||
version = "0.8.9"
|
version = "0.8.9"
|
||||||
@@ -4945,6 +5089,16 @@ dependencies = [
|
|||||||
"tracing-log",
|
"tracing-log",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "transpose"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e"
|
||||||
|
dependencies = [
|
||||||
|
"num-integer",
|
||||||
|
"strength_reduce",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tray-icon"
|
name = "tray-icon"
|
||||||
version = "0.21.3"
|
version = "0.21.3"
|
||||||
|
|||||||
@@ -34,6 +34,10 @@ hound = "3.5" # WAV file handling
|
|||||||
# Voice activity detection
|
# Voice activity detection
|
||||||
voice_activity_detector = "0.2"
|
voice_activity_detector = "0.2"
|
||||||
|
|
||||||
|
# Audio analysis for speaker detection
|
||||||
|
dasp = "0.11" # Digital audio signal processing
|
||||||
|
rustfft = "6.2" # FFT for frequency analysis
|
||||||
|
|
||||||
# Async runtime
|
# Async runtime
|
||||||
tokio = { version = "1", features = ["full"] }
|
tokio = { version = "1", features = ["full"] }
|
||||||
|
|
||||||
|
|||||||
+11
-57
@@ -277,6 +277,7 @@ async fn initialize_models(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Initialize storage
|
// Initialize storage
|
||||||
emit_log(&app_handle, &logs, "[Init] Initializing recording storage...");
|
emit_log(&app_handle, &logs, "[Init] Initializing recording storage...");
|
||||||
if let Ok(app_data_dir) = app_handle.path().app_data_dir() {
|
if let Ok(app_data_dir) = app_handle.path().app_data_dir() {
|
||||||
@@ -359,62 +360,6 @@ async fn stop_recording(
|
|||||||
Ok("Recording stopped".to_string())
|
Ok("Recording stopped".to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Stop recording and transcribe all at once (batch mode).
|
|
||||||
#[tauri::command]
|
|
||||||
async fn stop_recording_batch(
|
|
||||||
state: State<'_, AppState>,
|
|
||||||
app_handle: tauri::AppHandle,
|
|
||||||
) -> Result<Vec<TranscriptSegment>, String> {
|
|
||||||
let logs = Arc::clone(&state.logs);
|
|
||||||
|
|
||||||
emit_log(&app_handle, &logs, "[Audio] Stopping recording (batch mode)...");
|
|
||||||
|
|
||||||
// Get the audio samples
|
|
||||||
let audio_samples = {
|
|
||||||
let mut audio_guard = state.audio_capture.lock();
|
|
||||||
if let Some(ref mut capture) = *audio_guard {
|
|
||||||
capture.stop_recording()
|
|
||||||
} else {
|
|
||||||
return Err("No active recording".to_string());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let duration = audio_samples.len() as f32 / 16000.0;
|
|
||||||
emit_log(&app_handle, &logs, &format!("[Audio] Captured {:.1}s of audio", duration));
|
|
||||||
|
|
||||||
if audio_samples.is_empty() {
|
|
||||||
return Err("No audio captured".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Transcribe the audio
|
|
||||||
emit_log(&app_handle, &logs, "[Transcribe] Starting transcription...");
|
|
||||||
|
|
||||||
let app_handle_clone = app_handle.clone();
|
|
||||||
let mut segments = {
|
|
||||||
let transcriber = state.transcriber.lock();
|
|
||||||
if !transcriber.is_loaded() {
|
|
||||||
emit_log(&app_handle, &logs, "[Transcribe ERROR] Whisper model not loaded");
|
|
||||||
return Err("Whisper model not loaded. Please ensure the model is downloaded.".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
transcriber.transcribe_with_progress(&audio_samples, move |progress| {
|
|
||||||
// Emit progress event to frontend
|
|
||||||
let _ = app_handle_clone.emit("transcription-progress", progress);
|
|
||||||
})
|
|
||||||
.map_err(|e| format!("Transcription failed: {}", e))?
|
|
||||||
};
|
|
||||||
|
|
||||||
emit_log(&app_handle, &logs, &format!("[Transcribe] Got {} segments", segments.len()));
|
|
||||||
|
|
||||||
// Apply speaker labels using VAD
|
|
||||||
if let Some(ref mut separator) = *state.speaker_separator.lock() {
|
|
||||||
emit_log(&app_handle, &logs, "[Speaker] Applying speaker labels...");
|
|
||||||
segments = separator.apply_speaker_labels(&audio_samples, segments)
|
|
||||||
.map_err(|e| format!("Speaker separation failed: {}", e))?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(segments)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Transcribe a chunk of audio (for real-time transcription).
|
/// Transcribe a chunk of audio (for real-time transcription).
|
||||||
#[tauri::command]
|
#[tauri::command]
|
||||||
@@ -422,6 +367,7 @@ async fn transcribe_chunk(
|
|||||||
state: State<'_, AppState>,
|
state: State<'_, AppState>,
|
||||||
audio_data: Vec<f32>,
|
audio_data: Vec<f32>,
|
||||||
app_handle: tauri::AppHandle,
|
app_handle: tauri::AppHandle,
|
||||||
|
chunk_start_time: f64,
|
||||||
) -> Result<Vec<TranscriptSegment>, String> {
|
) -> Result<Vec<TranscriptSegment>, String> {
|
||||||
let transcriber = state.transcriber.lock();
|
let transcriber = state.transcriber.lock();
|
||||||
|
|
||||||
@@ -432,12 +378,20 @@ async fn transcribe_chunk(
|
|||||||
// Clone the app handle for the closure
|
// Clone the app handle for the closure
|
||||||
let app_handle_clone = app_handle.clone();
|
let app_handle_clone = app_handle.clone();
|
||||||
|
|
||||||
let segments = transcriber.transcribe_with_progress(&audio_data, move |progress| {
|
let mut segments = transcriber.transcribe_with_progress(&audio_data, move |progress| {
|
||||||
// Emit progress event to frontend
|
// Emit progress event to frontend
|
||||||
let _ = app_handle_clone.emit("transcription-progress", progress);
|
let _ = app_handle_clone.emit("transcription-progress", progress);
|
||||||
})
|
})
|
||||||
.map_err(|e| format!("Transcription failed: {}", e))?;
|
.map_err(|e| format!("Transcription failed: {}", e))?;
|
||||||
|
|
||||||
|
// Apply speaker detection using enhanced VAD
|
||||||
|
let mut separator_guard = state.speaker_separator.lock();
|
||||||
|
if let Some(ref mut separator) = *separator_guard {
|
||||||
|
// Process this chunk for speaker detection
|
||||||
|
segments = separator.apply_speaker_labels_to_chunk(&audio_data, segments, chunk_start_time)
|
||||||
|
.map_err(|e| format!("Speaker detection failed: {}", e))?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(segments)
|
Ok(segments)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+254
-45
@@ -1,10 +1,12 @@
|
|||||||
//! Voice Activity Detection for basic speaker separation.
|
//! Enhanced Voice Activity Detection for speaker separation in group meetings.
|
||||||
//!
|
//!
|
||||||
//! This module uses the Silero VAD model to detect speech segments
|
//! This module uses the Silero VAD model combined with voice characteristics
|
||||||
//! and provides basic speaker separation based on silence gaps.
|
//! to provide improved speaker separation for meetings with multiple speakers.
|
||||||
|
|
||||||
|
use rustfft::{FftPlanner, num_complex::Complex};
|
||||||
|
use std::collections::HashMap;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tracing::{debug, info};
|
use tracing::{debug, info, warn};
|
||||||
use voice_activity_detector::VoiceActivityDetector;
|
use voice_activity_detector::VoiceActivityDetector;
|
||||||
|
|
||||||
use super::transcriber::TranscriptSegment;
|
use super::transcriber::TranscriptSegment;
|
||||||
@@ -20,12 +22,41 @@ pub enum VadError {
|
|||||||
ProcessingError(String),
|
ProcessingError(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A detected speech segment with timing information.
|
/// Voice characteristics for speaker fingerprinting.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct VoiceFingerprint {
|
||||||
|
/// Average pitch (fundamental frequency) in Hz
|
||||||
|
pub avg_pitch: f32,
|
||||||
|
/// Pitch variance
|
||||||
|
pub pitch_variance: f32,
|
||||||
|
/// Average energy level
|
||||||
|
pub avg_energy: f32,
|
||||||
|
/// Spectral centroid (brightness indicator)
|
||||||
|
pub spectral_centroid: f32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VoiceFingerprint {
|
||||||
|
/// Calculate similarity between two voice fingerprints (0.0 to 1.0)
|
||||||
|
pub fn similarity(&self, other: &VoiceFingerprint) -> f32 {
|
||||||
|
// Normalize differences for each feature
|
||||||
|
let pitch_diff = (self.avg_pitch - other.avg_pitch).abs() / 200.0; // Normalize by typical pitch range
|
||||||
|
let variance_diff = (self.pitch_variance - other.pitch_variance).abs() / 50.0;
|
||||||
|
let energy_diff = (self.avg_energy - other.avg_energy).abs() / 0.5;
|
||||||
|
let centroid_diff = (self.spectral_centroid - other.spectral_centroid).abs() / 1000.0;
|
||||||
|
|
||||||
|
// Calculate weighted similarity
|
||||||
|
let diff = pitch_diff * 0.4 + variance_diff * 0.2 + energy_diff * 0.2 + centroid_diff * 0.2;
|
||||||
|
(1.0 - diff.min(1.0)).max(0.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A detected speech segment with timing information and voice characteristics.
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct SpeechSegment {
|
pub struct SpeechSegment {
|
||||||
pub start_sample: usize,
|
pub start_sample: usize,
|
||||||
pub end_sample: usize,
|
pub end_sample: usize,
|
||||||
pub speaker_id: u32,
|
pub speaker_id: u32,
|
||||||
|
pub fingerprint: Option<VoiceFingerprint>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SpeechSegment {
|
impl SpeechSegment {
|
||||||
@@ -45,19 +76,27 @@ impl SpeechSegment {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Voice Activity Detector for speaker separation.
|
/// Voice Activity Detector for speaker separation with enhanced tracking.
|
||||||
pub struct SpeakerSeparator {
|
pub struct SpeakerSeparator {
|
||||||
vad: VoiceActivityDetector,
|
vad: VoiceActivityDetector,
|
||||||
/// Minimum silence duration (in seconds) to consider a speaker change
|
/// Minimum silence duration (in seconds) to consider a speaker change
|
||||||
min_silence_for_speaker_change: f64,
|
_min_silence_for_speaker_change: f64,
|
||||||
/// Minimum speech duration (in seconds) to consider a valid segment
|
/// Minimum speech duration (in seconds) to consider a valid segment
|
||||||
min_speech_duration: f64,
|
min_speech_duration: f64,
|
||||||
|
/// Known speakers and their voice fingerprints
|
||||||
|
known_speakers: HashMap<u32, VoiceFingerprint>,
|
||||||
|
/// Next speaker ID to assign
|
||||||
|
next_speaker_id: u32,
|
||||||
|
/// Similarity threshold for matching speakers (0.0 to 1.0)
|
||||||
|
similarity_threshold: f32,
|
||||||
|
/// FFT planner for frequency analysis
|
||||||
|
fft_planner: FftPlanner<f32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SpeakerSeparator {
|
impl SpeakerSeparator {
|
||||||
/// Create a new speaker separator with default settings.
|
/// Create a new speaker separator with default settings.
|
||||||
pub fn new() -> Result<Self, VadError> {
|
pub fn new() -> Result<Self, VadError> {
|
||||||
Self::with_settings(1.5, 0.3)
|
Self::with_settings(0.8, 0.3) // Reduced silence threshold for group meetings
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a new speaker separator with custom settings.
|
/// Create a new speaker separator with custom settings.
|
||||||
@@ -75,86 +114,211 @@ impl SpeakerSeparator {
|
|||||||
.build()
|
.build()
|
||||||
.map_err(|e: voice_activity_detector::Error| VadError::InitError(e.to_string()))?;
|
.map_err(|e: voice_activity_detector::Error| VadError::InitError(e.to_string()))?;
|
||||||
|
|
||||||
info!("VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s",
|
info!("Enhanced VAD initialized with sample_rate={}, min_silence={}s, min_speech={}s",
|
||||||
VAD_SAMPLE_RATE, min_silence_for_speaker_change, min_speech_duration);
|
VAD_SAMPLE_RATE, min_silence_for_speaker_change, min_speech_duration);
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
vad,
|
vad,
|
||||||
min_silence_for_speaker_change,
|
_min_silence_for_speaker_change: min_silence_for_speaker_change,
|
||||||
min_speech_duration,
|
min_speech_duration,
|
||||||
|
known_speakers: HashMap::new(),
|
||||||
|
next_speaker_id: 0,
|
||||||
|
similarity_threshold: 0.7,
|
||||||
|
fft_planner: FftPlanner::new(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Detect speech segments and assign speaker IDs based on silence gaps.
|
/// Extract voice fingerprint from an audio segment.
|
||||||
|
fn extract_fingerprint(&mut self, audio: &[f32]) -> Option<VoiceFingerprint> {
|
||||||
|
if audio.len() < 2048 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate average energy
|
||||||
|
let avg_energy = audio.iter().map(|s| s.abs()).sum::<f32>() / audio.len() as f32;
|
||||||
|
|
||||||
|
// Estimate pitch using autocorrelation (simplified)
|
||||||
|
let pitch = self.estimate_pitch(audio);
|
||||||
|
|
||||||
|
// Calculate spectral centroid
|
||||||
|
let spectral_centroid = self.calculate_spectral_centroid(audio);
|
||||||
|
|
||||||
|
// Calculate pitch variance over windows
|
||||||
|
let pitch_variance = self.calculate_pitch_variance(audio);
|
||||||
|
|
||||||
|
Some(VoiceFingerprint {
|
||||||
|
avg_pitch: pitch,
|
||||||
|
pitch_variance,
|
||||||
|
avg_energy,
|
||||||
|
spectral_centroid,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimate pitch using zero-crossing rate (simplified approach).
|
||||||
|
fn estimate_pitch(&self, audio: &[f32]) -> f32 {
|
||||||
|
let mut zero_crossings = 0;
|
||||||
|
let mut prev_sign = audio[0] >= 0.0;
|
||||||
|
|
||||||
|
for sample in audio.iter().skip(1) {
|
||||||
|
let current_sign = *sample >= 0.0;
|
||||||
|
if current_sign != prev_sign {
|
||||||
|
zero_crossings += 1;
|
||||||
|
}
|
||||||
|
prev_sign = current_sign;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert zero-crossing rate to approximate frequency
|
||||||
|
let zcr = zero_crossings as f32 / audio.len() as f32;
|
||||||
|
zcr * VAD_SAMPLE_RATE as f32 / 2.0
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate spectral centroid (center of mass of spectrum).
|
||||||
|
fn calculate_spectral_centroid(&mut self, audio: &[f32]) -> f32 {
|
||||||
|
let fft_size = 2048.min(audio.len());
|
||||||
|
let mut input: Vec<Complex<f32>> = audio[..fft_size]
|
||||||
|
.iter()
|
||||||
|
.map(|&s| Complex::new(s, 0.0))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let fft = self.fft_planner.plan_fft_forward(fft_size);
|
||||||
|
fft.process(&mut input);
|
||||||
|
|
||||||
|
let mut weighted_sum = 0.0;
|
||||||
|
let mut magnitude_sum = 0.0;
|
||||||
|
|
||||||
|
for (i, complex) in input.iter().enumerate().take(fft_size / 2) {
|
||||||
|
let magnitude = complex.norm();
|
||||||
|
let frequency = i as f32 * VAD_SAMPLE_RATE as f32 / fft_size as f32;
|
||||||
|
weighted_sum += frequency * magnitude;
|
||||||
|
magnitude_sum += magnitude;
|
||||||
|
}
|
||||||
|
|
||||||
|
if magnitude_sum > 0.0 {
|
||||||
|
weighted_sum / magnitude_sum
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate pitch variance across multiple windows.
|
||||||
|
fn calculate_pitch_variance(&self, audio: &[f32]) -> f32 {
|
||||||
|
let window_size = 1024;
|
||||||
|
let num_windows = audio.len() / window_size;
|
||||||
|
|
||||||
|
if num_windows < 2 {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut pitches = Vec::new();
|
||||||
|
for i in 0..num_windows {
|
||||||
|
let start = i * window_size;
|
||||||
|
let end = (i + 1) * window_size;
|
||||||
|
let pitch = self.estimate_pitch(&audio[start..end]);
|
||||||
|
pitches.push(pitch);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate variance
|
||||||
|
let mean = pitches.iter().sum::<f32>() / pitches.len() as f32;
|
||||||
|
let variance = pitches.iter()
|
||||||
|
.map(|p| (p - mean).powi(2))
|
||||||
|
.sum::<f32>() / pitches.len() as f32;
|
||||||
|
|
||||||
|
variance.sqrt()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find or assign speaker ID based on voice fingerprint.
|
||||||
|
fn find_or_assign_speaker(&mut self, fingerprint: &VoiceFingerprint) -> u32 {
|
||||||
|
// Try to match with known speakers
|
||||||
|
let mut best_match = None;
|
||||||
|
let mut best_similarity = 0.0;
|
||||||
|
|
||||||
|
for (speaker_id, known_fp) in &self.known_speakers {
|
||||||
|
let similarity = fingerprint.similarity(known_fp);
|
||||||
|
if similarity > best_similarity && similarity >= self.similarity_threshold {
|
||||||
|
best_similarity = similarity;
|
||||||
|
best_match = Some(*speaker_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(speaker_id) = best_match {
|
||||||
|
debug!("Matched to existing speaker {} with similarity {:.2}", speaker_id, best_similarity);
|
||||||
|
speaker_id
|
||||||
|
} else {
|
||||||
|
// New speaker
|
||||||
|
let speaker_id = self.next_speaker_id;
|
||||||
|
self.next_speaker_id += 1;
|
||||||
|
self.known_speakers.insert(speaker_id, fingerprint.clone());
|
||||||
|
debug!("New speaker {} detected", speaker_id);
|
||||||
|
speaker_id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect speech segments and assign speaker IDs based on voice characteristics.
|
||||||
///
|
///
|
||||||
/// This is a simple heuristic: if there's a long enough silence gap,
|
/// This enhanced version uses voice fingerprinting to track speakers across
|
||||||
/// we assume a different speaker might be talking.
|
/// overlapping speech and short interruptions, making it suitable for group meetings.
|
||||||
pub fn detect_speakers(&mut self, audio: &[f32]) -> Result<Vec<SpeechSegment>, VadError> {
|
pub fn detect_speakers(&mut self, audio: &[f32]) -> Result<Vec<SpeechSegment>, VadError> {
|
||||||
let chunk_size = 512;
|
let chunk_size = 512;
|
||||||
let mut segments = Vec::new();
|
let mut raw_segments = Vec::new();
|
||||||
let mut current_segment_start: Option<usize> = None;
|
let mut current_segment_start: Option<usize> = None;
|
||||||
let mut last_speech_end: Option<usize> = None;
|
|
||||||
let mut current_speaker_id = 0u32;
|
|
||||||
|
|
||||||
let min_silence_samples = (self.min_silence_for_speaker_change * VAD_SAMPLE_RATE as f64) as usize;
|
|
||||||
let min_speech_samples = (self.min_speech_duration * VAD_SAMPLE_RATE as f64) as usize;
|
let min_speech_samples = (self.min_speech_duration * VAD_SAMPLE_RATE as f64) as usize;
|
||||||
|
|
||||||
debug!("Processing {} samples for VAD", audio.len());
|
debug!("Processing {} samples for enhanced VAD", audio.len());
|
||||||
|
|
||||||
|
// First pass: detect raw speech segments
|
||||||
for (chunk_idx, chunk) in audio.chunks(chunk_size).enumerate() {
|
for (chunk_idx, chunk) in audio.chunks(chunk_size).enumerate() {
|
||||||
if chunk.len() < chunk_size {
|
if chunk.len() < chunk_size {
|
||||||
// Skip incomplete chunks at the end
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let sample_offset = chunk_idx * chunk_size;
|
let sample_offset = chunk_idx * chunk_size;
|
||||||
|
|
||||||
// Predict voice activity for this chunk
|
|
||||||
let probability = self.vad.predict(chunk.iter().copied());
|
let probability = self.vad.predict(chunk.iter().copied());
|
||||||
|
|
||||||
let is_speech = probability > 0.5;
|
let is_speech = probability > 0.5;
|
||||||
|
|
||||||
if is_speech {
|
if is_speech {
|
||||||
if current_segment_start.is_none() {
|
if current_segment_start.is_none() {
|
||||||
// Check if we should change speaker
|
|
||||||
if let Some(last_end) = last_speech_end {
|
|
||||||
let silence_duration = sample_offset - last_end;
|
|
||||||
if silence_duration >= min_silence_samples {
|
|
||||||
current_speaker_id += 1;
|
|
||||||
debug!("Speaker change detected at sample {} (silence: {}ms)",
|
|
||||||
sample_offset, silence_duration * 1000 / VAD_SAMPLE_RATE as usize);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
current_segment_start = Some(sample_offset);
|
current_segment_start = Some(sample_offset);
|
||||||
}
|
}
|
||||||
} else if let Some(start) = current_segment_start {
|
} else if let Some(start) = current_segment_start {
|
||||||
// Speech ended
|
|
||||||
let segment_duration = sample_offset - start;
|
let segment_duration = sample_offset - start;
|
||||||
|
|
||||||
if segment_duration >= min_speech_samples {
|
if segment_duration >= min_speech_samples {
|
||||||
segments.push(SpeechSegment {
|
raw_segments.push((start, sample_offset));
|
||||||
start_sample: start,
|
|
||||||
end_sample: sample_offset,
|
|
||||||
speaker_id: current_speaker_id,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
last_speech_end = Some(sample_offset);
|
|
||||||
current_segment_start = None;
|
current_segment_start = None;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle case where speech continues to the end
|
// Handle speech continuing to the end
|
||||||
if let Some(start) = current_segment_start {
|
if let Some(start) = current_segment_start {
|
||||||
let segment_duration = audio.len() - start;
|
let segment_duration = audio.len() - start;
|
||||||
if segment_duration >= min_speech_samples {
|
if segment_duration >= min_speech_samples {
|
||||||
|
raw_segments.push((start, audio.len()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second pass: analyze voice characteristics and assign speakers
|
||||||
|
let mut segments = Vec::new();
|
||||||
|
for (start, end) in raw_segments {
|
||||||
|
let segment_audio = &audio[start..end];
|
||||||
|
|
||||||
|
// Extract voice fingerprint
|
||||||
|
let fingerprint = self.extract_fingerprint(segment_audio);
|
||||||
|
|
||||||
|
let speaker_id = if let Some(ref fp) = fingerprint {
|
||||||
|
self.find_or_assign_speaker(fp)
|
||||||
|
} else {
|
||||||
|
// Fallback: assign based on position
|
||||||
|
warn!("Could not extract fingerprint for segment at {}s", start as f64 / VAD_SAMPLE_RATE as f64);
|
||||||
|
self.next_speaker_id - 1 // Use last assigned speaker
|
||||||
|
};
|
||||||
|
|
||||||
segments.push(SpeechSegment {
|
segments.push(SpeechSegment {
|
||||||
start_sample: start,
|
start_sample: start,
|
||||||
end_sample: audio.len(),
|
end_sample: end,
|
||||||
speaker_id: current_speaker_id,
|
speaker_id,
|
||||||
|
fingerprint,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
info!("Detected {} speech segments with {} speakers",
|
info!("Detected {} speech segments with {} speakers",
|
||||||
segments.len(),
|
segments.len(),
|
||||||
@@ -188,6 +352,51 @@ impl SpeakerSeparator {
|
|||||||
|
|
||||||
Ok(transcript)
|
Ok(transcript)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Apply speaker labels to a chunk of transcript segments (for real-time processing).
|
||||||
|
///
|
||||||
|
/// This method is designed to work with the real-time transcription system,
|
||||||
|
/// maintaining speaker consistency across chunks.
|
||||||
|
pub fn apply_speaker_labels_to_chunk(
|
||||||
|
&mut self,
|
||||||
|
audio: &[f32],
|
||||||
|
mut transcript: Vec<TranscriptSegment>,
|
||||||
|
_chunk_start_time: f64,
|
||||||
|
) -> Result<Vec<TranscriptSegment>, VadError> {
|
||||||
|
// Detect speakers in this chunk
|
||||||
|
let speech_segments = self.detect_speakers(audio)?;
|
||||||
|
|
||||||
|
for segment in &mut transcript {
|
||||||
|
// Adjust times relative to chunk start
|
||||||
|
let segment_mid = (segment.start + segment.end) / 2.0;
|
||||||
|
|
||||||
|
// Find matching speech segment
|
||||||
|
for speech in &speech_segments {
|
||||||
|
let speech_start = speech.start_seconds();
|
||||||
|
let speech_end = speech.end_seconds();
|
||||||
|
|
||||||
|
if segment_mid >= speech_start && segment_mid <= speech_end {
|
||||||
|
// Use consistent speaker numbering across chunks
|
||||||
|
segment.speaker = format!("Speaker {}", speech.speaker_id + 1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up old speakers if we have too many (helps with memory in long meetings)
|
||||||
|
if self.known_speakers.len() > 10 {
|
||||||
|
warn!("Many speakers detected ({}), consider adjusting similarity threshold", self.known_speakers.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(transcript)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reset speaker tracking (useful between different meetings).
|
||||||
|
pub fn reset_speakers(&mut self) {
|
||||||
|
self.known_speakers.clear();
|
||||||
|
self.next_speaker_id = 0;
|
||||||
|
info!("Speaker tracking reset");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SpeakerSeparator {
|
impl Default for SpeakerSeparator {
|
||||||
|
|||||||
+6
-2
@@ -202,8 +202,10 @@ function App() {
|
|||||||
// If we have enough audio (at least 5 seconds worth at 16kHz)
|
// If we have enough audio (at least 5 seconds worth at 16kHz)
|
||||||
if (audioChunk.length >= 5 * 16000) {
|
if (audioChunk.length >= 5 * 16000) {
|
||||||
// Transcribe the chunk
|
// Transcribe the chunk
|
||||||
|
const chunkStartTime = totalProcessedSamples.current / 16000;
|
||||||
const newSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
const newSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
||||||
audioData: audioChunk
|
audioData: audioChunk,
|
||||||
|
chunkStartTime: chunkStartTime
|
||||||
});
|
});
|
||||||
|
|
||||||
if (newSegments.length > 0) {
|
if (newSegments.length > 0) {
|
||||||
@@ -293,9 +295,11 @@ function App() {
|
|||||||
|
|
||||||
if (finalChunk.length > 0) {
|
if (finalChunk.length > 0) {
|
||||||
console.log(`Processing final chunk of ${finalChunk.length} samples`);
|
console.log(`Processing final chunk of ${finalChunk.length} samples`);
|
||||||
|
const chunkStartTime = totalProcessedSamples.current / 16000;
|
||||||
// The progress will be updated via events from the backend
|
// The progress will be updated via events from the backend
|
||||||
const finalSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
const finalSegments = await invoke<TranscriptSegment[]>("transcribe_chunk", {
|
||||||
audioData: finalChunk
|
audioData: finalChunk,
|
||||||
|
chunkStartTime: chunkStartTime
|
||||||
});
|
});
|
||||||
|
|
||||||
if (finalSegments.length > 0) {
|
if (finalSegments.length > 0) {
|
||||||
|
|||||||
Reference in New Issue
Block a user