feat: use whisperx to allow diarization

2025-08-05 13:35:00 -07:00
parent 53f30437d9
commit 97e325c0b7
2 changed files with 24 additions and 4 deletions
@@ -6,6 +6,22 @@ This is a local recording, transcription, and summarisation script that listens

 Gotta run it locally. Sorry!

+1. Install Ollama, pull `llama3:8b`
+2. Install `uv`.
+3. Run `uvx python@3.12 whisperx` to install whisperx
+4. Run `uvx python@3.12 main.py`.
+
+### Diarization
+
+To download the models needed for diarization, create a Hugging Face account and accept the terms of use for these two models:
+
+1. https://huggingface.co/pyannote/segmentation-3.0
+2. https://huggingface.co/pyannote/speaker-diarization-3.1
+
+In `transcribe_audio()`, uncomment the `--diarize` line and the `--hf_token` line, and replace the placeholder with your Hugging Face token. The token needs read permission for the gated public repositories you have been granted access to.
+
+The token is only needed for the initial model download: once you have run the script with it at least once, you can remove your token and the script should keep working as normal.
+
 ## Feedback and Bugs

 If you have feedback or a bug report, please feel free to open a GitHub issue!
@@ -26,15 +26,19 @@ def record_audio():
 def transcribe_audio():
+    """Transcribe MEETING_FILE by running the WhisperX CLI through `uvx`.
+
+    Runs on CPU with English as the language and writes JSON output to the
+    current directory. Raises subprocess.CalledProcessError when the command
+    exits with a non-zero status (check=True).
+    """
     print("🔠 Transcribing with WhisperX...")
     subprocess.run([
-        # "pipenv run python3 -m whisperx",
-        "whisper",
+        "uvx",
+        "whisperx",
         MEETING_FILE,
         "--device", "cpu",  # Use CPU
         "--language", "en",
-        # "--diarize", Put back when whisperX works.
+        # Uncomment the next line to enable diarization, requires you to download the model from Hugging Face.
+        # "--diarize",
+        "--compute_type", "float32",
+        # You should only need to uncomment this once to perform the initial model download.
+        # Kept ahead of the final argument (with its own trailing comma) so that
+        # uncommenting it cannot merge with a neighbouring string literal.
+        # "--hf_token", "your_huggingface_token_here",
         "--model", WHISPER_MODEL,
         "--output_format", "json",
         "--output_dir", "."
     ], check=True)

 def summarize_with_local_model():
@@ -43,7 +47,7 @@ def summarize_with_local_model():
        transcript = json.load(f)

    text = "\n".join(
-        seg["text"]
+        f"{seg['speaker']}: {seg['text']}"
        for seg in transcript["segments"]
    )