feat: add meeting transcription app scaffolding
- Add Python backend structure with FastAPI for transcription/summarization
- Add React UI with audio recording, transcript, and summary views
- Configure Tauri to manage Python backend lifecycle
- Set up Windows cross-compilation with cargo-xwin
- Add Gitea CI workflow for lint, test, and multi-platform builds
- Configure ESLint, Prettier, and Vitest for code quality

Note: App scaffolding only - Python env and models not yet set up
@@ -0,0 +1 @@
"""Chronara backend - Local meeting transcription and summarization."""
@@ -0,0 +1,78 @@
"""Main FastAPI application for Chronara."""

from pathlib import Path

from fastapi import Body, FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware

from .models.audio import AudioProcessor
from .models.llm import LlamaSummarizer
from .models.transcriber import WhisperXTranscriber

app = FastAPI(title="Chronara API", version="0.1.0")

# Enable CORS for the Tauri frontend. CORSMiddleware matches allow_origins
# literally, so the dev server's varying port needs allow_origin_regex.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["tauri://localhost"],
    allow_origin_regex=r"http://localhost:\d+",
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize models
MODEL_DIR = Path(__file__).parent.parent.parent / "models"
transcriber = WhisperXTranscriber(model_dir=MODEL_DIR)
summarizer = LlamaSummarizer(model_dir=MODEL_DIR)
audio_processor = AudioProcessor()


@app.get("/health")
async def health_check():
    """Check if the API is running and models are loaded."""
    return {
        "status": "healthy",
        "models": {
            "whisper": transcriber.is_loaded,
            "llama": summarizer.is_loaded,
        },
    }


@app.websocket("/ws/transcribe")
async def transcribe_audio(websocket: WebSocket):
    """WebSocket endpoint for real-time audio transcription."""
    await websocket.accept()

    try:
        while True:
            # Receive audio chunk
            audio_data = await websocket.receive_bytes()

            # Process audio
            audio_chunk = audio_processor.process_chunk(audio_data)

            # Transcribe if we have enough audio
            if audio_processor.has_speech(audio_chunk):
                result = await transcriber.transcribe_chunk(audio_chunk)

                if result:
                    await websocket.send_json({
                        "type": "transcription",
                        "data": result,
                    })

    except WebSocketDisconnect:
        pass  # Client disconnected; nothing further to send
    except Exception as e:
        # 1011 signals an internal error; 1000 would claim a normal closure
        await websocket.close(code=1011, reason=str(e))


@app.post("/summarize")
async def summarize_transcript(transcript: str = Body(..., embed=True)):
    """Summarize a meeting transcript (JSON body: {"transcript": ...})."""
    summary = await summarizer.summarize(transcript)
    return {"summary": summary}
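A minimal client sketch, not part of this commit, for exercising the endpoints above. It assumes the third-party websockets and httpx packages and a backend bound to localhost:8000 (the scaffolding does not pin a port yet); the JSON body shape matches the /summarize handler.

"""Hypothetical test client for the Chronara API - illustrative only."""

import asyncio

import httpx
import websockets


async def stream_audio(pcm_chunks) -> None:
    """Send raw PCM chunks and print any transcription events."""
    # The port is an assumption; the scaffold does not configure one yet.
    async with websockets.connect("ws://localhost:8000/ws/transcribe") as ws:
        for chunk in pcm_chunks:  # 16 kHz, 16-bit mono PCM bytes
            await ws.send(chunk)
            try:
                # The server only replies when speech is detected, so poll briefly.
                print(await asyncio.wait_for(ws.recv(), timeout=0.1))
            except asyncio.TimeoutError:
                continue


async def summarize(transcript: str) -> str:
    """POST a transcript and return the generated summary."""
    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(
            "http://localhost:8000/summarize",
            json={"transcript": transcript},
        )
        resp.raise_for_status()
        return resp.json()["summary"]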
@@ -0,0 +1 @@
"""Model modules for Chronara."""
@@ -0,0 +1,71 @@
"""Audio processing utilities."""

import wave

import numpy as np
import pyaudio


class AudioProcessor:
    """Handles audio capture and processing."""

    def __init__(self, sample_rate: int = 16000, channels: int = 1):
        """Initialize audio processor."""
        self.sample_rate = sample_rate
        self.channels = channels
        self.chunk_size = 1024
        self.format = pyaudio.paInt16

        # Initialize PyAudio
        self.audio = pyaudio.PyAudio()

        # Audio buffer for accumulating chunks
        self.buffer = []
        self.min_speech_duration = 0.5  # seconds

    def start_recording(self) -> pyaudio.Stream:
        """Start audio recording stream."""
        stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
        )
        return stream

    def stop_recording(self, stream: pyaudio.Stream) -> None:
        """Stop audio recording."""
        stream.stop_stream()
        stream.close()

    def process_chunk(self, audio_bytes: bytes) -> np.ndarray:
        """Convert audio bytes to a normalized numpy array."""
        # Convert bytes to numpy array
        audio_array = np.frombuffer(audio_bytes, dtype=np.int16)

        # Normalize the int16 range to [-1, 1]
        audio_float = audio_array.astype(np.float32) / 32768.0

        return audio_float

    def has_speech(self, audio_chunk: np.ndarray, energy_threshold: float = 0.01) -> bool:
        """Simple voice activity detection based on energy."""
        # Calculate RMS energy
        energy = np.sqrt(np.mean(audio_chunk**2))

        # Check if energy exceeds threshold
        return energy > energy_threshold

    def save_audio(self, audio_data: bytes, filepath: str) -> None:
        """Save audio data to a WAV file."""
        with wave.open(filepath, "wb") as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(self.audio.get_sample_size(self.format))
            wf.setframerate(self.sample_rate)
            wf.writeframes(audio_data)

    def __del__(self):
        """Clean up PyAudio."""
        if hasattr(self, "audio"):
            self.audio.terminate()
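A quick smoke test for AudioProcessor, illustrative only: it assumes a working PyAudio input device, and the backend.models.audio import path is a guess since the diff omits file paths.

"""Hypothetical AudioProcessor smoke test - illustrative only."""

from backend.models.audio import AudioProcessor  # import path is an assumption

processor = AudioProcessor()
stream = processor.start_recording()

speech_frames = []
# Roughly two seconds of audio: (sample_rate / chunk_size) reads per second.
for _ in range(int(processor.sample_rate / processor.chunk_size * 2)):
    data = stream.read(processor.chunk_size)
    if processor.has_speech(processor.process_chunk(data)):
        speech_frames.append(data)

processor.stop_recording(stream)
processor.save_audio(b"".join(speech_frames), "speech_only.wav")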
@@ -0,0 +1,69 @@
"""Local LLM for meeting summarization using Llama."""

from pathlib import Path
from typing import Optional

from llama_cpp import Llama


class LlamaSummarizer:
    """Handles meeting summarization using a local Llama model."""

    def __init__(self, model_dir: Path, model_size: str = "1B"):
        """Initialize Llama model."""
        self.model_dir = model_dir
        self.is_loaded = False

        model_path = model_dir / f"llama-3.2-{model_size}-instruct-Q4_K_M.gguf"

        try:
            self.llm = Llama(
                model_path=str(model_path),
                n_ctx=8192,  # Context window
                n_threads=4,  # CPU threads
                n_gpu_layers=-1,  # Offload all layers to GPU if available
                verbose=False,
            )
            self.is_loaded = True
        except Exception as e:
            print(f"Failed to load Llama model: {e}")
            self.is_loaded = False

    async def summarize(self, transcript: str) -> Optional[str]:
        """Generate a meeting summary from a transcript."""
        if not self.is_loaded:
            return None

        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant that creates concise meeting summaries. Focus on:
- Key decisions made
- Action items and who owns them
- Important discussions and their outcomes
- Next steps

Keep the summary structured and easy to scan.<|eot_id|><|start_header_id|>user<|end_header_id|>

Please summarize this meeting transcript:

{transcript}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Meeting Summary:
"""

        try:
            # Note: llama_cpp inference is synchronous and blocks the event
            # loop; acceptable for scaffolding, worth offloading later.
            response = self.llm(
                prompt,
                max_tokens=1024,
                temperature=0.7,
                top_p=0.9,
                stop=["<|eot_id|>", "<|end_of_text|>"],
            )

            return response["choices"][0]["text"].strip()

        except Exception as e:
            print(f"Summarization error: {e}")
            return None
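Because summarize() is a coroutine, calling it outside FastAPI needs an event loop. A hedged usage sketch, assuming the GGUF file named in __init__ already sits in models/ and the same guessed import path:

"""Hypothetical LlamaSummarizer usage - illustrative only."""

import asyncio
from pathlib import Path

from backend.models.llm import LlamaSummarizer  # import path is an assumption

summarizer = LlamaSummarizer(model_dir=Path("models"))

transcript = "Alice: Let's ship on Friday.\nBob: I'll own the release notes."
summary = asyncio.run(summarizer.summarize(transcript))
print(summary or "Model not loaded - see the scaffolding note in the commit message.")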
@@ -0,0 +1,87 @@
"""WhisperX transcription with speaker diarization."""

from pathlib import Path
from typing import Any, Optional

import numpy as np
import torch
import whisperx


class WhisperXTranscriber:
    """Handles audio transcription and speaker diarization using WhisperX."""

    def __init__(self, model_dir: Path, model_size: str = "base"):
        """Initialize WhisperX with local models."""
        self.model_dir = model_dir
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.compute_type = "float16" if self.device == "cuda" else "int8"
        self.is_loaded = False

        try:
            # Load ASR model
            self.model = whisperx.load_model(
                model_size,
                self.device,
                compute_type=self.compute_type,
                download_root=str(model_dir / "whisper"),
            )

            # Load alignment model
            self.align_model, self.align_metadata = whisperx.load_align_model(
                language_code="en",
                device=self.device,
                model_dir=str(model_dir / "alignment"),
            )

            # Load diarization pipeline
            self.diarize_model = whisperx.DiarizationPipeline(
                device=self.device,
                model_name=str(model_dir / "diarization"),
            )

            self.is_loaded = True
        except Exception as e:
            print(f"Failed to load WhisperX models: {e}")
            self.is_loaded = False

    async def transcribe_chunk(self, audio_chunk: np.ndarray) -> Optional[dict[str, Any]]:
        """Transcribe an audio chunk with speaker diarization."""
        if not self.is_loaded:
            return None

        try:
            # Transcribe
            result = self.model.transcribe(
                audio_chunk,
                batch_size=16,
            )

            # Align whisper output for word-level timestamps
            result = whisperx.align(
                result["segments"],
                self.align_model,
                self.align_metadata,
                audio_chunk,
                self.device,
            )

            # Diarize
            diarize_segments = self.diarize_model(audio_chunk)
            result = whisperx.assign_word_speakers(diarize_segments, result)

            # Format output
            formatted_result = []
            for segment in result["segments"]:
                formatted_result.append({
                    "start": segment["start"],
                    "end": segment["end"],
                    "text": segment["text"],
                    "speaker": segment.get("speaker", "Unknown"),
                })

            return {"segments": formatted_result}

        except Exception as e:
            print(f"Transcription error: {e}")
            return None
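The transcriber can be smoke-tested the same way. A hedged sketch, assuming the WhisperX models are already downloaded into models/ and the same guessed import path; the silent buffer just exercises the pipeline end to end (transcribe_chunk returns None when loading or inference fails):

"""Hypothetical WhisperXTranscriber usage - illustrative only."""

import asyncio
from pathlib import Path

import numpy as np

from backend.models.transcriber import WhisperXTranscriber  # path is an assumption

transcriber = WhisperXTranscriber(model_dir=Path("models"))

# One second of silent float32 audio at 16 kHz; swap in real microphone data.
audio = np.zeros(16000, dtype=np.float32)
result = asyncio.run(transcriber.transcribe_chunk(audio))
if result:
    for seg in result["segments"]:
        print(f"[{seg['start']:.1f}-{seg['end']:.1f}] {seg['speaker']}: {seg['text']}")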