feat: capture audio and loopback

This commit is contained in:
2026-01-28 18:08:25 -08:00
parent 74c334c939
commit c3acd8e7a6
4 changed files with 488 additions and 62 deletions
+8 -1
View File
@@ -52,8 +52,15 @@ windows = { version = "0.62", features = [
"Win32_System_Com", "Win32_System_Com",
"Win32_Foundation", "Win32_Foundation",
"Win32_Media_Audio", "Win32_Media_Audio",
"Win32_Media_Audio_Endpoints", "Win32_Devices_Properties",
"Win32_Media_KernelStreaming",
"Win32_System_Com_StructuredStorage",
"Win32_System_Threading", "Win32_System_Threading",
"Win32_Security",
"Win32_System_SystemServices",
"Win32_System_Variant",
"Win32_Media_Multimedia",
"Win32_UI_Shell_PropertiesSystem",
] } ] }
[patch.crates-io] [patch.crates-io]
+159 -54
View File
@@ -2,12 +2,15 @@
//! //!
//! This module handles audio recording in a thread-safe way by using //! This module handles audio recording in a thread-safe way by using
//! a shared buffer that can be accessed from the Tauri state. //! a shared buffer that can be accessed from the Tauri state.
//!
//! On Windows, it captures both microphone input AND system audio (loopback)
//! to record both sides of a meeting conversation.
use parking_lot::Mutex; use parking_lot::Mutex;
use std::sync::Arc; use std::sync::Arc;
use std::thread::{self, JoinHandle}; use std::thread::{self, JoinHandle};
use thiserror::Error; use thiserror::Error;
use tracing::{debug, error, info, warn}; use tracing::{error, info, warn};
/// Sample rate expected by Whisper (16kHz mono) /// Sample rate expected by Whisper (16kHz mono)
pub const WHISPER_SAMPLE_RATE: u32 = 16000; pub const WHISPER_SAMPLE_RATE: u32 = 16000;
@@ -33,6 +36,8 @@ pub struct AudioCapture {
is_recording: Arc<Mutex<bool>>, is_recording: Arc<Mutex<bool>>,
should_stop: Arc<Mutex<bool>>, should_stop: Arc<Mutex<bool>>,
recording_thread: Option<JoinHandle<()>>, recording_thread: Option<JoinHandle<()>>,
#[cfg(target_os = "windows")]
loopback_capture: Option<super::wasapi_loopback::WasapiLoopback>,
} }
// Implement Send + Sync manually since our struct only contains thread-safe types // Implement Send + Sync manually since our struct only contains thread-safe types
@@ -47,10 +52,13 @@ impl AudioCapture {
is_recording: Arc::new(Mutex::new(false)), is_recording: Arc::new(Mutex::new(false)),
should_stop: Arc::new(Mutex::new(false)), should_stop: Arc::new(Mutex::new(false)),
recording_thread: None, recording_thread: None,
#[cfg(target_os = "windows")]
loopback_capture: None,
}) })
} }
/// Start recording audio in a background thread. /// Start recording audio in a background thread.
/// Captures both microphone input and system audio (loopback) on Windows.
pub fn start_recording(&mut self) -> Result<(), AudioError> { pub fn start_recording(&mut self) -> Result<(), AudioError> {
if *self.is_recording.lock() { if *self.is_recording.lock() {
warn!("Already recording"); warn!("Already recording");
@@ -62,82 +70,83 @@ impl AudioCapture {
*self.should_stop.lock() = false; *self.should_stop.lock() = false;
*self.is_recording.lock() = true; *self.is_recording.lock() = true;
// Start WASAPI loopback capture for system audio (Windows only)
#[cfg(target_os = "windows")]
{
let mut loopback = super::wasapi_loopback::WasapiLoopback::new();
if let Err(e) = loopback.start_capture(Arc::clone(&self.buffer)) {
warn!("Failed to start WASAPI loopback: {}", e);
} else {
info!("WASAPI loopback capture initialized");
}
self.loopback_capture = Some(loopback);
}
let buffer = Arc::clone(&self.buffer); let buffer = Arc::clone(&self.buffer);
let is_recording = Arc::clone(&self.is_recording); let is_recording = Arc::clone(&self.is_recording);
let should_stop = Arc::clone(&self.should_stop); let should_stop = Arc::clone(&self.should_stop);
// Spawn a thread to handle audio capture // Spawn a thread to handle microphone capture via cpal
let handle = thread::spawn(move || { let handle = thread::spawn(move || {
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use cpal::{SampleRate, StreamConfig};
let host = cpal::default_host(); let host = cpal::default_host();
let device = match host.default_input_device() { // Get microphone (input device)
Some(d) => d, let input_device = host.default_input_device();
None => {
error!("No input device available"); if input_device.is_none() {
*is_recording.lock() = false; warn!("No microphone available - only system audio will be captured");
return;
} }
};
info!("Using audio input device: {}", device.name().unwrap_or_default()); let mut streams: Vec<cpal::Stream> = Vec::new();
// Get supported config // Set up microphone capture
let supported_config = match device.default_input_config() { if let Some(ref device) = input_device {
Ok(c) => c, info!("Microphone device: {}", device.name().unwrap_or_default());
Err(e) => {
error!("Failed to get input config: {}", e);
*is_recording.lock() = false;
return;
}
};
debug!("Supported config: {:?}", supported_config); if let Ok(config) = device.default_input_config() {
let sample_rate = config.sample_rate().0;
// We want 16kHz mono for Whisper let channels = config.channels();
let config = StreamConfig { let sample_format = config.sample_format();
channels: 1, info!("Microphone config: {} Hz, {} ch, {:?}", sample_rate, channels, sample_format);
sample_rate: SampleRate(WHISPER_SAMPLE_RATE),
buffer_size: cpal::BufferSize::Default,
};
let buffer_clone = Arc::clone(&buffer); let buffer_clone = Arc::clone(&buffer);
let err_fn = |err| error!("Audio stream error: {}", err); let stream_config: cpal::StreamConfig = config.into();
// Build the input stream let stream = build_input_stream(
let stream = match device.build_input_stream( device,
&config, &stream_config,
move |data: &[f32], _: &cpal::InputCallbackInfo| { sample_format,
buffer_clone.lock().extend_from_slice(data); sample_rate,
}, channels,
err_fn, buffer_clone,
None, "mic",
) { );
Ok(s) => s,
Err(e) => { if let Some(s) = stream {
error!("Failed to build input stream: {}", e); if s.play().is_ok() {
*is_recording.lock() = false; info!("Microphone capture started");
return; streams.push(s);
}
}
} }
};
if let Err(e) = stream.play() {
error!("Failed to start stream: {}", e);
*is_recording.lock() = false;
return;
} }
info!("Audio recording started"); // Note: Even if mic fails, WASAPI loopback may still be capturing system audio
if streams.is_empty() {
warn!("No microphone stream started - relying on WASAPI loopback for system audio");
} else {
info!("Audio recording started with {} microphone stream(s)", streams.len());
}
// Keep the stream alive until stop is requested // Keep the streams alive until stop is requested
while !*should_stop.lock() { while !*should_stop.lock() {
thread::sleep(std::time::Duration::from_millis(100)); thread::sleep(std::time::Duration::from_millis(100));
} }
// Stream is automatically stopped when dropped // Streams are automatically stopped when dropped
drop(stream); drop(streams);
*is_recording.lock() = false; *is_recording.lock() = false;
info!("Audio recording stopped"); info!("Audio recording stopped");
}); });
@@ -151,7 +160,13 @@ impl AudioCapture {
// Signal the thread to stop // Signal the thread to stop
*self.should_stop.lock() = true; *self.should_stop.lock() = true;
// Wait for the thread to finish // Stop WASAPI loopback capture
#[cfg(target_os = "windows")]
if let Some(ref mut loopback) = self.loopback_capture {
loopback.stop_capture();
}
// Wait for the microphone thread to finish
if let Some(handle) = self.recording_thread.take() { if let Some(handle) = self.recording_thread.take() {
let _ = handle.join(); let _ = handle.join();
} }
@@ -201,6 +216,96 @@ impl Drop for AudioCapture {
} }
} }
/// Build an input stream for the given device with automatic format handling.
///
/// Wraps `cpal::Device::build_input_stream` for the F32, I16 and I32 sample
/// formats. In every audio callback the raw samples are normalized to f32,
/// mixed down to mono (per-frame channel average), resampled to
/// `WHISPER_SAMPLE_RATE` when the device rate differs, and appended to
/// `buffer`.
///
/// Returns `None` (after logging) when the sample format is unsupported or
/// the stream could not be built; callers treat a missing stream as
/// non-fatal.
fn build_input_stream(
    device: &cpal::Device,
    config: &cpal::StreamConfig,
    sample_format: cpal::SampleFormat,
    sample_rate: u32,          // device's native sample rate in Hz
    channels: u16,             // device's native channel count
    buffer: Arc<Mutex<Vec<f32>>>,
    source_name: &'static str, // label used in log messages (e.g. "mic")
) -> Option<cpal::Stream> {
    use cpal::traits::DeviceTrait;

    // `move` so the 'static `source_name` is captured by value; the error
    // callback may outlive this function's stack frame.
    let err_fn = move |err| error!("Audio stream error ({}): {}", source_name, err);

    // Create a processing closure that handles mono conversion and resampling.
    // The outer closure is a factory so each match arm below can obtain its
    // own processing closure (each arm captures it into a different cpal
    // callback type).
    let make_processor = move || {
        let buffer = Arc::clone(&buffer);
        move |samples: Vec<f32>| {
            // Convert to mono if stereo: average each frame's channels.
            let mono_samples: Vec<f32> = if channels > 1 {
                samples
                    .chunks(channels as usize)
                    .map(|chunk| chunk.iter().sum::<f32>() / channels as f32)
                    .collect()
            } else {
                samples
            };

            // Resample to 16kHz if needed.
            // NOTE(review): assumes a module-level `resample` helper exists
            // in this file — not visible in this chunk; confirm it matches
            // the wasapi_loopback::resample semantics.
            let resampled = if sample_rate != WHISPER_SAMPLE_RATE {
                resample(&mono_samples, sample_rate, WHISPER_SAMPLE_RATE)
            } else {
                mono_samples
            };

            buffer.lock().extend_from_slice(&resampled);
        }
    };

    // Dispatch on the device's native sample format; each arm normalizes its
    // integer samples into [-1.0, 1.0) f32 before handing off to the
    // shared processor.
    let stream = match sample_format {
        cpal::SampleFormat::F32 => {
            let process = make_processor();
            device.build_input_stream(
                config,
                move |data: &[f32], _: &cpal::InputCallbackInfo| {
                    process(data.to_vec());
                },
                err_fn,
                None,
            )
        }
        cpal::SampleFormat::I16 => {
            let process = make_processor();
            device.build_input_stream(
                config,
                move |data: &[i16], _: &cpal::InputCallbackInfo| {
                    // i16 full-scale is 32768 (2^15).
                    let samples: Vec<f32> = data.iter().map(|&s| s as f32 / 32768.0).collect();
                    process(samples);
                },
                err_fn,
                None,
            )
        }
        cpal::SampleFormat::I32 => {
            let process = make_processor();
            device.build_input_stream(
                config,
                move |data: &[i32], _: &cpal::InputCallbackInfo| {
                    // i32 full-scale is 2147483648 (2^31).
                    let samples: Vec<f32> = data.iter().map(|&s| s as f32 / 2147483648.0).collect();
                    process(samples);
                },
                err_fn,
                None,
            )
        }
        format => {
            error!("Unsupported sample format for {}: {:?}", source_name, format);
            return None;
        }
    };

    // A failed build is logged but not fatal — the caller may still have
    // other capture sources (e.g. WASAPI loopback).
    match stream {
        Ok(s) => Some(s),
        Err(e) => {
            warn!("Failed to build {} stream: {}", source_name, e);
            None
        }
    }
}
/// Convert audio samples from i16 to f32 format. /// Convert audio samples from i16 to f32 format.
pub fn i16_to_f32(samples: &[i16]) -> Vec<f32> { pub fn i16_to_f32(samples: &[i16]) -> Vec<f32> {
samples.iter().map(|&s| s as f32 / 32768.0).collect() samples.iter().map(|&s| s as f32 / 32768.0).collect()
+3
View File
@@ -11,6 +11,9 @@ pub mod transcriber;
pub mod summarizer; pub mod summarizer;
pub mod vad; pub mod vad;
#[cfg(target_os = "windows")]
pub mod wasapi_loopback;
pub use audio::AudioCapture; pub use audio::AudioCapture;
pub use transcriber::WhisperTranscriber; pub use transcriber::WhisperTranscriber;
pub use summarizer::LlamaSummarizer; pub use summarizer::LlamaSummarizer;
+311
View File
@@ -0,0 +1,311 @@
//! WASAPI loopback capture for Windows.
//!
//! This module captures system audio (what's playing through speakers)
//! using Windows Audio Session API (WASAPI) in loopback mode.
use parking_lot::Mutex;
use std::sync::Arc;
use std::thread::{self, JoinHandle};
use tracing::{error, info, warn};
use windows::Win32::Media::Audio;
use windows::Win32::System::Com;
/// WASAPI loopback capture state.
///
/// Owns the background capture thread plus the shared flags used to
/// coordinate start/stop with it.
pub struct WasapiLoopback {
    /// True while the capture thread is running (cleared by the thread itself).
    is_capturing: Arc<Mutex<bool>>,
    /// Set by `stop_capture` to ask the capture thread to exit its loop.
    should_stop: Arc<Mutex<bool>>,
    /// Join handle for the background capture thread, if one was spawned.
    capture_thread: Option<JoinHandle<()>>,
}
impl WasapiLoopback {
    /// Create a new, idle WASAPI loopback capture instance.
    pub fn new() -> Self {
        Self {
            is_capturing: Arc::new(Mutex::new(false)),
            should_stop: Arc::new(Mutex::new(false)),
            capture_thread: None,
        }
    }

    /// Start capturing system audio in loopback mode.
    ///
    /// Spawns a background thread that appends converted samples to
    /// `output_buffer`. Calling this while a capture is already running
    /// logs a warning and is otherwise a no-op.
    pub fn start_capture(&mut self, output_buffer: Arc<Mutex<Vec<f32>>>) -> Result<(), String> {
        if *self.is_capturing.lock() {
            warn!("WASAPI loopback already capturing");
            return Ok(());
        }

        // Reset the stop flag and mark ourselves live before spawning.
        *self.should_stop.lock() = false;
        *self.is_capturing.lock() = true;

        let capturing_flag = Arc::clone(&self.is_capturing);
        let stop_flag = Arc::clone(&self.should_stop);

        let worker = thread::spawn(move || {
            if let Err(e) = capture_loopback_audio(output_buffer, stop_flag.clone()) {
                error!("WASAPI loopback capture error: {}", e);
            }
            // The thread clears its own "capturing" flag on the way out so
            // `is_capturing()` reflects reality even after an error.
            *capturing_flag.lock() = false;
            info!("WASAPI loopback capture stopped");
        });

        self.capture_thread = Some(worker);
        Ok(())
    }

    /// Signal the capture thread to stop and block until it has exited.
    pub fn stop_capture(&mut self) {
        *self.should_stop.lock() = true;
        if let Some(worker) = self.capture_thread.take() {
            let _ = worker.join();
        }
    }

    /// Whether the background capture thread is currently running.
    pub fn is_capturing(&self) -> bool {
        *self.is_capturing.lock()
    }
}
impl Drop for WasapiLoopback {
    /// Ensure the capture thread is signalled and joined when the handle
    /// goes out of scope, so no detached thread keeps writing to the buffer.
    fn drop(&mut self) {
        self.stop_capture();
    }
}
/// Target sample rate for Whisper (16kHz)
const TARGET_SAMPLE_RATE: u32 = 16000;
/// Capture loopback audio from the default render device.
///
/// Initializes COM on the calling thread, runs the WASAPI capture loop until
/// `should_stop` flips to true, then tears COM back down. Captured audio is
/// converted to 16 kHz mono f32 and appended to `buffer`.
///
/// # Errors
/// Returns a descriptive `String` if COM or any WASAPI setup step fails.
fn capture_loopback_audio(
    buffer: Arc<Mutex<Vec<f32>>>,
    should_stop: Arc<Mutex<bool>>,
) -> Result<(), String> {
    // SAFETY: COM calls require per-thread initialization; CoInitializeEx is
    // balanced with CoUninitialize on every exit path below.
    unsafe {
        Com::CoInitializeEx(None, Com::COINIT_MULTITHREADED)
            .ok()
            .map_err(|e| format!("Failed to initialize COM: {}", e))?;

        // Run the actual capture in a helper so CoUninitialize executes no
        // matter how (or where) the loop exits. The original code leaked the
        // COM apartment by never uninitializing.
        let result = run_capture_loop(buffer, should_stop);
        Com::CoUninitialize();
        result
    }
}

/// Device setup, the polling capture loop, and teardown.
///
/// Precondition: COM is already initialized on the calling thread
/// (guaranteed by `capture_loopback_audio`).
fn run_capture_loop(
    buffer: Arc<Mutex<Vec<f32>>>,
    should_stop: Arc<Mutex<bool>>,
) -> Result<(), String> {
    unsafe {
        // Create device enumerator
        let enumerator: Audio::IMMDeviceEnumerator =
            Com::CoCreateInstance(&Audio::MMDeviceEnumerator, None, Com::CLSCTX_ALL)
                .map_err(|e| format!("Failed to create device enumerator: {}", e))?;

        // Get default render (output) device - this is key for loopback!
        let device = enumerator
            .GetDefaultAudioEndpoint(Audio::eRender, Audio::eConsole)
            .map_err(|e| format!("Failed to get default render device: {}", e))?;

        // Get device name for logging
        if let Ok(id) = device.GetId() {
            info!("WASAPI loopback device: {:?}", id.to_string());
        }

        // Activate audio client
        let audio_client: Audio::IAudioClient = device
            .Activate(Com::CLSCTX_ALL, None)
            .map_err(|e| format!("Failed to activate audio client: {}", e))?;

        // Get the mix format (what the device is actually using).
        // GetMixFormat allocates the WAVEFORMATEX with CoTaskMemAlloc; the
        // caller is responsible for freeing it (done after Initialize below).
        let mix_format = audio_client
            .GetMixFormat()
            .map_err(|e| format!("Failed to get mix format: {}", e))?;

        let format = &*mix_format;
        let sample_rate = format.nSamplesPerSec;
        let channels = format.nChannels;
        let bits_per_sample = format.wBitsPerSample;
        let block_align = format.nBlockAlign;

        info!(
            "WASAPI loopback format: {} Hz, {} ch, {} bits",
            sample_rate, channels, bits_per_sample
        );

        // Initialize audio client in loopback mode
        // Key flags: AUDCLNT_STREAMFLAGS_LOOPBACK for capturing output
        // Must use shared mode (not exclusive) for loopback
        let buffer_duration = 10_000_000i64; // 1 second in 100-nanosecond units
        let init_result = audio_client.Initialize(
            Audio::AUDCLNT_SHAREMODE_SHARED,
            Audio::AUDCLNT_STREAMFLAGS_LOOPBACK,
            buffer_duration,
            0,
            mix_format,
            None,
        );
        // Free the mix format whether or not Initialize succeeded
        // (fixes a CoTaskMemAlloc leak in the original code).
        Com::CoTaskMemFree(Some(mix_format as *const _));
        init_result.map_err(|e| format!("Failed to initialize audio client: {}", e))?;

        // Get capture client
        let capture_client: Audio::IAudioCaptureClient = audio_client
            .GetService()
            .map_err(|e| format!("Failed to get capture client: {}", e))?;

        // Start capturing
        audio_client
            .Start()
            .map_err(|e| format!("Failed to start audio client: {}", e))?;

        info!("WASAPI loopback capture started");

        // Capture loop - use polling since event mode doesn't work for loopback
        while !*should_stop.lock() {
            // Sleep a bit to avoid busy-waiting (10ms = 100Hz polling)
            thread::sleep(std::time::Duration::from_millis(10));

            // Drain EVERY queued packet before sleeping again. The original
            // code processed at most one packet per wake-up, which can fall
            // behind the device period and eventually drop audio; the
            // canonical WASAPI capture loop reads until GetNextPacketSize
            // returns 0.
            loop {
                let frames_available = match capture_client.GetNextPacketSize() {
                    Ok(frames) => frames,
                    Err(e) => {
                        warn!("Failed to get packet size: {}", e);
                        break;
                    }
                };

                if frames_available == 0 {
                    break;
                }

                // Get buffer
                let mut data_ptr: *mut u8 = std::ptr::null_mut();
                let mut num_frames: u32 = 0;
                let mut flags: u32 = 0;

                if let Err(e) = capture_client.GetBuffer(
                    &mut data_ptr,
                    &mut num_frames,
                    &mut flags,
                    None,
                    None,
                ) {
                    warn!("Failed to get buffer: {}", e);
                    break;
                }

                if num_frames > 0 && !data_ptr.is_null() {
                    // NOTE(review): `flags` may carry AUDCLNT_BUFFERFLAGS_SILENT,
                    // in which case the buffer contents should be treated as
                    // silence — currently the raw bytes are converted as-is.
                    // Convert to f32 samples
                    let samples = convert_to_f32(
                        data_ptr,
                        num_frames as usize,
                        channels as usize,
                        bits_per_sample,
                        block_align as usize,
                    );

                    // Convert to mono
                    let mono_samples = to_mono(&samples, channels as usize);

                    // Resample to 16kHz if needed
                    let resampled = if sample_rate != TARGET_SAMPLE_RATE {
                        resample(&mono_samples, sample_rate, TARGET_SAMPLE_RATE)
                    } else {
                        mono_samples
                    };

                    // Add to buffer
                    buffer.lock().extend_from_slice(&resampled);
                }

                // Release buffer
                if let Err(e) = capture_client.ReleaseBuffer(num_frames) {
                    warn!("Failed to release buffer: {}", e);
                }
            }
        }

        // Stop and cleanup
        let _ = audio_client.Stop();

        Ok(())
    }
}
/// Convert raw interleaved audio bytes to f32 samples.
///
/// `data` must point to at least `num_frames * block_align` readable bytes
/// laid out as interleaved frames of `channels` samples each. Unsupported
/// bit depths log a warning and yield an empty vector.
fn convert_to_f32(
    data: *mut u8,
    num_frames: usize,
    channels: usize,
    bits_per_sample: u16,
    block_align: usize,
) -> Vec<f32> {
    let total_samples = num_frames * channels;
    let mut samples = Vec::with_capacity(total_samples);

    unsafe {
        match bits_per_sample {
            16 => {
                // SAFETY: caller guarantees the buffer holds `total_samples`
                // contiguous i16 values.
                let src = std::slice::from_raw_parts(data as *const i16, total_samples);
                samples.extend(src.iter().map(|&s| s as f32 / 32768.0));
            }
            32 => {
                // Could be f32 or i32 - WASAPI mix format is usually f32
                // SAFETY: same layout guarantee as above, reinterpreted as f32.
                let src = std::slice::from_raw_parts(data as *const f32, total_samples);
                samples.extend_from_slice(src);
            }
            24 => {
                // 24-bit little-endian samples packed in 3 bytes per channel.
                for i in 0..total_samples {
                    let offset = (i / channels) * block_align + (i % channels) * 3;
                    let raw = (*data.add(offset) as i32)
                        | ((*data.add(offset + 1) as i32) << 8)
                        | ((*data.add(offset + 2) as i32) << 16);
                    // Sign-extend from 24 to 32 bits via shift-up/shift-down.
                    let signed = (raw << 8) >> 8;
                    samples.push(signed as f32 / 8388608.0);
                }
            }
            _ => {
                warn!("Unsupported bits per sample: {}", bits_per_sample);
            }
        }
    }

    samples
}
/// Convert multi-channel audio to mono by averaging the channels of each frame.
fn to_mono(samples: &[f32], channels: usize) -> Vec<f32> {
    if channels == 1 {
        // Already mono — just copy.
        return samples.to_vec();
    }

    let mut mono = Vec::with_capacity(samples.len() / channels + 1);
    for frame in samples.chunks(channels) {
        let mut acc = 0.0f32;
        for &s in frame {
            acc += s;
        }
        mono.push(acc / channels as f32);
    }
    mono
}
/// Simple linear-interpolation resampler between arbitrary sample rates.
///
/// Returns the input unchanged when the rates match or the input is empty.
fn resample(samples: &[f32], from_rate: u32, to_rate: u32) -> Vec<f32> {
    if from_rate == to_rate || samples.is_empty() {
        return samples.to_vec();
    }

    let ratio = to_rate as f64 / from_rate as f64;
    let out_len = (samples.len() as f64 * ratio) as usize;
    let last = samples.len() - 1;

    (0..out_len)
        .map(|i| {
            // Fractional position of output sample i in the source signal.
            let pos = i as f64 / ratio;
            let lo = pos.floor() as usize;
            let hi = (lo + 1).min(last); // clamp at the final sample
            let t = pos - lo as f64;
            (samples[lo] as f64 * (1.0 - t) + samples[hi] as f64 * t) as f32
        })
        .collect()
}