From 97e325c0b71f9f85e1e45d0c6d3792590955b4d0 Mon Sep 17 00:00:00 2001 From: Naomi Carrigan Date: Tue, 5 Aug 2025 13:35:00 -0700 Subject: [PATCH] feat: use whisperx to allow diarization --- README.md | 16 ++++++++++++++++ main.py | 12 ++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0971475..c1acddd 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,22 @@ This is a local recording, transcription, and summarisation script that listens Gotta run it locally. Sorry! +1. Install Ollama, pull `llama3:8b` +2. Install `uv`. +3. Run `uvx python@3.12 whisperx` to install whisperx +4. Run `uvx python@3.12 main.py`. + +### Diarization + +To download the models needed for diarization, you need to create a Hugging Face account. Then you should agree to the terms to access these two models: + +1. https://huggingface.co/pyannote/segmentation-3.0 +2. https://huggingface.co/pyannote/speaker-diarization-3.1 + +Uncomment the `--hf_token` line and provide your Hugging Face token. This token needs read permissions for gated public repositories you have access to. + +Once you have run the script with this token at least once, you can remove your token and the script should work as normal. + ## Feedback and Bugs If you have feedback or a bug report, please feel free to open a GitHub issue! diff --git a/main.py b/main.py index 6a73508..94d1264 100644 --- a/main.py +++ b/main.py @@ -26,15 +26,19 @@ def record_audio(): def transcribe_audio(): print("🔠 Transcribing with WhisperX...") subprocess.run([ - # "pipenv run python3 -m whisperx", - "whisper", + "uvx", + "whisperx", MEETING_FILE, "--device", "cpu", # Use CPU "--language", "en", - # "--diarize", Put back when whisperX works. + # Uncomment the next line to enable diarization, requires you to download the model from Hugging Face. + # "--diarize", + "--compute_type", "float32", "--model", WHISPER_MODEL, "--output_format", "json", "--output_dir", "." + # You should only need to uncomment this once to perform the initial model download. + # "--hf_token", "your_huggingface_token_here" ], check=True) def summarize_with_local_model(): @@ -43,7 +47,7 @@ def summarize_with_local_model(): transcript = json.load(f) text = "\n".join( - seg["text"] + f"{seg['speaker']}: {seg['text']}" for seg in transcript["segments"] )