# ephemere/python/cohort/evaluate_technical_proficiency.py

"""Evaluate the technical proficiency of cohort applicants using their GitHub profiles.

Combines each applicant's self-described proficiency with signals from their
public GitHub profile and scores them as beginner, intermediate, or advanced based
on public repo count, follower count, and the variety of technologies detected.

Data files (place in data/):
  - applicants_to_evaluate.json  Applicants (discord_id, project_url, proficiency_self, project_reason)

Outputs (written to data/):
  - proficiency_evaluations.json  Proficiency scores and tech stacks per applicant

Env vars:
  - None (uses public GitHub API; may be rate-limited without authentication)
"""

import json
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path

# Directory holding the input and output JSON files: the "data" folder located
# two levels above the directory that contains this file.
DATA_DIR = Path(__file__).parent.parent.parent / "data"

# Base URL of the GitHub REST API. Requests are sent without authentication,
# which works for public data but is rate limited.
GITHUB_API = "https://api.github.com"


def extract_github_info(url: str) -> tuple[str | None, str | None]:
    """Extract owner and repo from GitHub URL."""
    # Handle various GitHub URL formats
    patterns = [
        r"github\.com/([^/]+)/([^/\s?#]+)",  # github.com/owner/repo
        r"github\.com/([^/\s?#]+)/?$",  # github.com/owner (profile)
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            groups = match.groups()
            if len(groups) == 2:
                return groups[0], groups[1].rstrip(".git")
            elif len(groups) == 1:
                return groups[0], None
    return None, None


def fetch_github_user(username: str) -> dict | None:
    """Fetch a GitHub user's public profile.

    Args:
        username: GitHub login. It is percent-encoded before being placed in
            the URL because it originates from applicant-supplied text.

    Returns:
        The decoded JSON profile object, or None when the request fails for
        any reason (HTTP error, network error, timeout, malformed body) or
        the payload is not a JSON object.
    """
    safe_username = urllib.parse.quote(username, safe="")
    url = f"{GITHUB_API}/users/{safe_username}"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(req, timeout=10) as response:
            data = json.loads(response.read().decode())
    except Exception as exc:
        # Best-effort lookup: one failed request must not abort the whole
        # run, but report it so failures are not mistaken for empty profiles.
        print(f"  Warning: could not fetch GitHub profile for {username}: {exc}")
        return None

    return data if isinstance(data, dict) else None


def fetch_github_repos(username: str) -> list:
    """Fetch a user's public repositories, most recently updated first.

    Only the first page (up to 100 repositories) is requested.

    Args:
        username: GitHub login. It is percent-encoded before being placed in
            the URL because it originates from applicant-supplied text.

    Returns:
        The decoded JSON list of repository objects, or an empty list when the
        request fails for any reason or the payload is not a JSON list.
    """
    safe_username = urllib.parse.quote(username, safe="")
    url = f"{GITHUB_API}/users/{safe_username}/repos?per_page=100&sort=updated"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(req, timeout=10) as response:
            data = json.loads(response.read().decode())
    except Exception as exc:
        # Best-effort lookup: one failed request must not abort the whole
        # run, but report it so failures are not mistaken for "no repos".
        print(f"  Warning: could not fetch GitHub repos for {username}: {exc}")
        return []

    # Callers slice and iterate the result, so never hand back a non-list.
    return data if isinstance(data, list) else []


def fetch_repo_languages(owner: str, repo: str) -> dict:
    """Fetch the languages used in a single repository.

    Args:
        owner: GitHub login that owns the repository.
        repo: Repository name.
        Both values are percent-encoded before being placed in the URL because
        they originate from applicant-supplied text.

    Returns:
        The decoded JSON object whose keys are language names, or an empty
        dict when the request fails for any reason or the payload is not a
        JSON object.
    """
    safe_owner = urllib.parse.quote(owner, safe="")
    safe_repo = urllib.parse.quote(repo, safe="")
    url = f"{GITHUB_API}/repos/{safe_owner}/{safe_repo}/languages"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(req, timeout=10) as response:
            data = json.loads(response.read().decode())
    except Exception as exc:
        # Best-effort lookup: one failed request must not abort the whole run.
        print(f"  Warning: could not fetch languages for {owner}/{repo}: {exc}")
        return {}

    return data if isinstance(data, dict) else {}


# Technology names recognised in free text (matched against lowercased input).
_TECH_TERMS = (
    # Languages
    "python", "java", "javascript", "typescript", "c++", "c#", "ruby", "go",
    "rust", "swift", "kotlin", "php", "perl", "scala", "r",
    # Frameworks
    "react", "angular", "vue", "node", "express", "django", "flask", "spring",
    "rails", "laravel",
    # Front-end
    "html", "css", "sass", "scss", "tailwind", "bootstrap",
    # Data stores
    "sql", "mysql", "postgresql", "mongodb", "redis", "firebase",
    # Infrastructure
    "docker", "kubernetes", "aws", "azure", "gcp", "git",
    # ML / data
    "machine learning", "ml", "ai", "data science", "tensorflow", "pytorch",
)

# Lookarounds are used instead of \b because \b cannot match after a trailing
# "+" or "#", which would make "c++" and "c#" undetectable before a space or
# at the end of the text.
_TECH_RE = re.compile(
    r"(?<!\w)(?:" + "|".join(re.escape(term) for term in _TECH_TERMS) + r")(?!\w)"
)

# Checked in this order; the first level with a matching keyword wins.
_LEVEL_KEYWORDS = (
    (
        "advanced",
        (
            "advanced",
            "expert",
            "senior",
            "professional",
            "years of experience",
            "proficient",
            "strong",
        ),
    ),
    (
        "beginner",
        (
            "beginner",
            "learning",
            "new to",
            "just started",
            "basic",
            "novice",
            "early",
        ),
    ),
    (
        "intermediate",
        (
            "intermediate",
            "comfortable",
            "familiar",
            "some experience",
            "worked with",
        ),
    ),
)

# Keywords must start at a word boundary so that e.g. "clearly" or "nearly" do
# not count as "early"; suffixes ("basics", "expertise") still match.
_LEVEL_PATTERNS = tuple(
    (level, re.compile(r"\b(?:" + "|".join(re.escape(kw) for kw in keywords) + r")"))
    for level, keywords in _LEVEL_KEYWORDS
)


def analyze_proficiency_text(text: str) -> tuple[str, list[str]]:
    """Analyze self-described proficiency text.

    Args:
        text: Free-form text written by the applicant; None or empty text is
            treated as having no signals.

    Returns:
        ``(level, technologies)`` where ``level`` is "beginner",
        "intermediate" (also the default when no keyword matches), or
        "advanced", and ``technologies`` is the sorted list of distinct
        lowercase technology names mentioned in the text.
    """
    text_lower = (text or "").lower()

    # Sorted so the output is deterministic from run to run.
    technologies = sorted(set(_TECH_RE.findall(text_lower)))

    # "machine learning" names a technology; it must not trigger the
    # beginner keyword "learning".
    level_text = text_lower.replace("machine learning", " ")

    level = "intermediate"  # default
    for candidate, pattern in _LEVEL_PATTERNS:
        if pattern.search(level_text):
            level = candidate
            break

    return level, technologies


def _tier_points(value: int, high: int, low: int) -> int:
    """Return 2 points when value >= high, 1 when value >= low, otherwise 0."""
    if value >= high:
        return 2
    if value >= low:
        return 1
    return 0


def evaluate_applicant(applicant: dict, index: int, total: int) -> dict:
    """Evaluate a single applicant's technical proficiency.

    Combines the level and technologies found in the applicant's own text with
    signals from their GitHub profile (public repo count, followers, repository
    languages) into a final beginner / intermediate / advanced rating.

    Args:
        applicant: Record with a required "discord_id" and optional
            "project_url", "proficiency_self" and "project_reason" values
            (missing or null optional values are treated as empty).
        index: Zero-based position of the applicant, used for progress output.
        total: Total number of applicants, used for progress output.

    Returns:
        A dict with the collected GitHub data, detected technologies, the
        self-described level, the final proficiency, and explanatory notes.

    Raises:
        KeyError: If "discord_id" is missing from the applicant record.
    """
    discord_id = applicant["discord_id"]
    project_url = applicant.get("project_url") or ""
    proficiency_self = applicant.get("proficiency_self") or ""
    project_reason = applicant.get("project_reason") or ""

    print(f"[{index + 1}/{total}] Evaluating {discord_id}...")

    result = {
        "discord_id": discord_id,
        "github_username": None,
        "github_repos_count": 0,
        "github_followers": 0,
        "languages_from_github": [],
        "languages_from_text": [],
        "self_described_level": None,
        "final_proficiency": "intermediate",  # default
        "tech_stack": [],
        "notes": [],
    }

    # Analyze self-description
    text_level, text_techs = analyze_proficiency_text(
        f"{proficiency_self} {project_reason}"
    )
    result["self_described_level"] = text_level
    # Sorted so the saved output is stable from run to run.
    result["languages_from_text"] = sorted(text_techs)

    # Fetch GitHub data if a GitHub URL was provided
    has_github_url = "github.com" in project_url
    profile_fetched = False
    if has_github_url:
        owner, repo = extract_github_info(project_url)

        if owner:
            result["github_username"] = owner

            user_data = fetch_github_user(owner)
            if user_data:
                profile_fetched = True
                result["github_repos_count"] = user_data.get("public_repos") or 0
                result["github_followers"] = user_data.get("followers") or 0

            # Primary language of the 10 most recently updated repos
            languages = {
                r["language"].lower()
                for r in fetch_github_repos(owner)[:10]
                if r.get("language")
            }

            # If a specific repo was linked, include every language it uses
            if repo:
                languages.update(
                    lang.lower() for lang in fetch_repo_languages(owner, repo)
                )
            result["languages_from_github"] = sorted(languages)

            time.sleep(0.5)  # Rate limiting

    # Combine tech stack
    all_tech = set(result["languages_from_github"]) | set(result["languages_from_text"])
    result["tech_stack"] = sorted(all_tech)

    # Determine final proficiency
    # Factors: self-description, GitHub activity, tech diversity
    github_score = (
        _tier_points(result["github_repos_count"], high=20, low=10)
        + _tier_points(result["github_followers"], high=50, low=10)
        + _tier_points(len(result["tech_stack"]), high=6, low=3)
    )

    # Map self-described level to score
    level_scores = {"beginner": 0, "intermediate": 2, "advanced": 4}
    self_score = level_scores.get(text_level, 2)

    total_score = github_score + self_score
    if total_score >= 7:
        result["final_proficiency"] = "advanced"
    elif total_score >= 3:
        result["final_proficiency"] = "intermediate"
    else:
        result["final_proficiency"] = "beginner"

    # Add notes. A failed profile lookup is reported as such rather than as
    # "no public repos", which would misrepresent the applicant.
    if not has_github_url:
        result["notes"].append("No GitHub URL provided")
    elif result["github_username"] is None:
        result["notes"].append("GitHub URL could not be parsed")
    elif not profile_fetched:
        result["notes"].append("GitHub profile could not be fetched")
    elif result["github_repos_count"] == 0:
        result["notes"].append("GitHub profile has no public repos")

    return result


def main() -> None:
    """Evaluate every applicant in the input file and save the results.

    Reads ``applicants_to_evaluate.json`` from DATA_DIR, evaluates each
    applicant in order, writes ``proficiency_evaluations.json`` to DATA_DIR,
    and prints a summary of how many applicants landed in each level.
    """
    input_path = DATA_DIR / "applicants_to_evaluate.json"
    output_path = DATA_DIR / "proficiency_evaluations.json"

    # Explicit UTF-8 so non-ASCII applicant text is read the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(input_path, encoding="utf-8") as f:
        applicants = json.load(f)

    total = len(applicants)
    print(f"Evaluating {total} applicants...\n")

    evaluations = []
    for i, applicant in enumerate(applicants):
        evaluations.append(evaluate_applicant(applicant, i, total))

        # Progress update every 10
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{total} complete")

    # Save results
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(evaluations, f, indent=2)

    # Summary
    print("\n=== EVALUATION COMPLETE ===")
    for level in ("beginner", "intermediate", "advanced"):
        count = sum(1 for e in evaluations if e["final_proficiency"] == level)
        print(f"{level.capitalize()}: {count}")
    print(f"Total: {len(evaluations)}")
    print("\nResults saved to proficiency_evaluations.json")


# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()