# chronara/src/backend/models/llm.py

"""Local LLM for meeting summarization using Llama."""

import asyncio
import threading
from pathlib import Path
from typing import Optional

from llama_cpp import Llama


class LlamaSummarizer:
    """Summarize meeting transcripts with a local Llama GGUF model.

    The model is loaded once in ``__init__``. A load failure is printed and
    recorded in ``is_loaded`` rather than raised; ``summarize`` then returns
    ``None`` instead of attempting inference.
    """

    def __init__(self, model_dir: Path, model_size: str = "1B"):
        """Load the Llama model file found in *model_dir*.

        Args:
            model_dir: Directory that contains the ``.gguf`` model file. A
                plain string path is accepted as well.
            model_size: Size tag substituted into the expected file name
                ``Llama-3.2-{model_size}-Instruct-Q4_K_M.gguf``.
        """
        # Coerce so that a ``str`` path does not fail on the ``/`` join below.
        self.model_dir = Path(model_dir)
        self.is_loaded = False
        # Always define the attribute so a failed load still leaves a
        # well-formed object behind.
        self.llm = None
        # ``summarize`` runs inference on a worker thread; this lock keeps
        # model calls strictly serialized, as they were when the call ran
        # inline on the event loop.
        self._llm_lock = threading.Lock()

        model_path = self.model_dir / f"Llama-3.2-{model_size}-Instruct-Q4_K_M.gguf"

        try:
            self.llm = Llama(
                model_path=str(model_path),
                n_ctx=8192,  # Context window
                n_threads=4,  # CPU threads
                n_gpu_layers=-1,  # Use GPU if available
                verbose=False,
            )
        except Exception as e:
            # Best-effort load: report the problem and stay in the
            # "not loaded" state instead of crashing the caller.
            print(f"Failed to load Llama model: {e}")
        else:
            self.is_loaded = True

    @staticmethod
    def _build_prompt(transcript: str) -> str:
        """Wrap *transcript* in the Llama 3 instruct chat template."""
        # NOTE(review): the template starts with an explicit
        # <|begin_of_text|>; the completion API may prepend its own BOS
        # token as well - confirm against the installed llama-cpp-python
        # version before changing this string.
        return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant that creates concise meeting summaries. Focus on:
- Key decisions made
- Action items and who owns them
- Important discussions and their outcomes
- Next steps

Keep the summary structured and easy to scan.<|eot_id|><|start_header_id|>user<|end_header_id|>

Please summarize this meeting transcript:

{transcript}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Meeting Summary:
"""

    def _generate(self, prompt: str):
        """Run one blocking completion call; meant to execute off the event loop."""
        with self._llm_lock:
            return self.llm(
                prompt,
                max_tokens=1024,
                temperature=0.7,
                top_p=0.9,
                stop=["<|eot_id|>", "<|end_of_text|>"],
            )

    async def summarize(self, transcript: str) -> Optional[str]:
        """Generate a meeting summary from *transcript*.

        The blocking model call is executed in the event loop's default
        executor so other coroutines keep running while tokens are generated.

        Args:
            transcript: Full text of the meeting transcript.

        Returns:
            The stripped summary text, or ``None`` when the model is not
            loaded, the transcript is empty or whitespace-only, or
            generation fails (the error is printed).
        """
        if not self.is_loaded:
            return None

        # Nothing to summarize: skip inference rather than let the model
        # invent a summary for an empty meeting.
        if not transcript or not str(transcript).strip():
            return None

        prompt = self._build_prompt(transcript)

        try:
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(None, self._generate, prompt)
            return response["choices"][0]["text"].strip()
        except Exception as e:
            print(f"Summarization error: {e}")
            return None