From 97e325c0b71f9f85e1e45d0c6d3792590955b4d0 Mon Sep 17 00:00:00 2001
From: Naomi Carrigan <commits@nhcarrigan.com>
Date: Tue, 5 Aug 2025 13:35:00 -0700
Subject: [PATCH] feat: use whisperx to allow diarization

---
 README.md | 16 ++++++++++++++++
 main.py   | 12 ++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 0971475..c1acddd 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,22 @@ This is a local recording, transcription, and summarisation script that listens
 
 Gotta run it locally. Sorry!
 
+1. Install Ollama, pull `llama3:8b`
+2. Install `uv`.
+3. Run `uvx python@3.12 whisperx` to install whisperx
+4. Run `uvx python@3.12 main.py`.
+
+### Diarization
+
+To download the models needed for diarization, create a Hugging Face account, then accept the terms of use for these two gated models:
+
+1. https://huggingface.co/pyannote/segmentation-3.0
+2. https://huggingface.co/pyannote/speaker-diarization-3.1
+
+In `main.py`, uncomment the `--diarize` line and the `--hf_token` line, and replace the placeholder with your Hugging Face token. The token needs read permission for the gated public repositories you have been granted access to.
+
+Once you have run the script with this token at least once, you can remove your token and the script should work as normal.
+
 ## Feedback and Bugs
 
 If you have feedback or a bug report, please feel free to open a GitHub issue!
diff --git a/main.py b/main.py
index 6a73508..94d1264 100644
--- a/main.py
+++ b/main.py
@@ -26,15 +26,19 @@ def record_audio():
 def transcribe_audio():
     print("🔠 Transcribing with WhisperX...")
     subprocess.run([
-        # "pipenv run python3 -m whisperx",
-        "whisper",
+        "uvx",
+        "whisperx",
         MEETING_FILE,
         "--device", "cpu",  # Use CPU
         "--language", "en",
-        # "--diarize", Put back when whisperX works.
+        # Uncomment the next line to enable diarization; this requires downloading the gated models from Hugging Face.
+        # "--diarize",
+        # Only needed for the initial model download. Kept mid-list with a trailing comma so uncommenting cannot merge it into a neighbouring string.
+        # "--hf_token", "your_huggingface_token_here",
+        "--compute_type", "float32",
         "--model", WHISPER_MODEL,
         "--output_format", "json",
         "--output_dir", "."
     ], check=True)
 
 def summarize_with_local_model():
@@ -43,7 +47,7 @@ def summarize_with_local_model():
         transcript = json.load(f)
 
     text = "\n".join(
-        seg["text"]
+        f"{seg['speaker']}: {seg['text']}" if "speaker" in seg else seg["text"]  # "speaker" exists only when diarization ran
         for seg in transcript["segments"]
     )