# ephemere/python/cohort/evaluate_technical_proficiency.py

import json
import re
import time
import urllib.error
import urllib.request

# Base URL of the GitHub REST API. The fetch_* helpers in this file send no
# credentials, which is enough for public data but means requests are subject
# to GitHub's rate limiting.
GITHUB_API = "https://api.github.com"


def extract_github_info(url: str) -> tuple[str | None, str | None]:
    """Extract owner and repo from GitHub URL."""
    # Handle various GitHub URL formats
    patterns = [
        r"github\.com/([^/]+)/([^/\s?#]+)",  # github.com/owner/repo
        r"github\.com/([^/\s?#]+)/?$",  # github.com/owner (profile)
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            groups = match.groups()
            if len(groups) == 2:
                return groups[0], groups[1].rstrip(".git")
            elif len(groups) == 1:
                return groups[0], None
    return None, None


def fetch_github_user(username: str) -> dict | None:
    """Fetch a GitHub user's public profile from the REST API.

    Args:
        username: GitHub login to look up.

    Returns:
        The decoded JSON object describing the user, or ``None`` when the
        request or decoding fails for any reason, or when the payload is not
        a JSON object.
    """
    from urllib.parse import quote

    # Percent-encode the login: it originates from applicant-supplied URLs and
    # must not be able to alter the request path or query.
    login = quote(str(username), safe="")
    req = urllib.request.Request(f"{GITHUB_API}/users/{login}")
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even if reading or
        # decoding raises.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        # Deliberately best-effort: any network, HTTP or parsing failure is
        # reported to the caller as "no profile" instead of aborting the run.
        return None

    return payload if isinstance(payload, dict) else None


def fetch_github_repos(username: str) -> list:
    """Fetch a user's public repositories, most recently updated first.

    Only a single page of up to 100 repositories is requested.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        The decoded JSON list of repository objects, or an empty list when
        the request or decoding fails for any reason, or when the payload is
        not a JSON array.
    """
    from urllib.parse import quote

    # Percent-encode the login: it originates from applicant-supplied URLs and
    # must not be able to alter the request path or query.
    login = quote(str(username), safe="")
    url = f"{GITHUB_API}/users/{login}/repos?per_page=100&sort=updated"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even if reading or
        # decoding raises.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        # Deliberately best-effort: any failure is reported as "no repos"
        # instead of aborting the run.
        return []

    # Callers slice and iterate the result, so never hand back a non-list.
    return payload if isinstance(payload, list) else []


def fetch_repo_languages(owner: str, repo: str) -> dict:
    """Fetch the languages reported by GitHub for one repository.

    Args:
        owner: Login of the user or organisation owning the repository.
        repo: Repository name.

    Returns:
        The decoded JSON object returned by the ``/languages`` endpoint (its
        keys are used as language names by the caller), or an empty dict when
        the request or decoding fails for any reason, or when the payload is
        not a JSON object.
    """
    from urllib.parse import quote

    # Percent-encode both segments: they originate from applicant-supplied
    # URLs and must not be able to alter the request path or query.
    owner_part = quote(str(owner), safe="")
    repo_part = quote(str(repo), safe="")
    url = f"{GITHUB_API}/repos/{owner_part}/{repo_part}/languages"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even if reading or
        # decoding raises.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        # Deliberately best-effort: any failure is reported as "no languages"
        # instead of aborting the run.
        return {}

    return payload if isinstance(payload, dict) else {}


def analyze_proficiency_text(text: str) -> tuple[str, list[str]]:
    """Estimate a proficiency level and list technologies named in free text.

    Matching is case-insensitive. Technologies are found with whole-word
    regular expressions; the level is decided by plain substring search for
    a fixed set of keywords.

    Args:
        text: Free-form self-description; ``None`` or ``""`` is treated as
            empty text.

    Returns:
        ``(level, technologies)`` where ``level`` is ``"advanced"``,
        ``"beginner"`` or ``"intermediate"`` (the fallback), and
        ``technologies`` is the alphabetically sorted list of distinct
        lower-cased technology names found.
    """
    text_lower = (text or "").lower()

    tech_patterns = [
        # "c++" and "c#" end in punctuation, so a trailing \b after them would
        # only match when a word character follows (missing "C++ " or "C#.").
        # They therefore sit outside the sub-group carrying the trailing \b.
        r"\b(c\+\+|c#|(?:python|java|javascript|typescript|ruby|go|rust|swift|kotlin|php|perl|scala|r)\b)",
        r"\b(react|angular|vue|node|express|django|flask|spring|rails|laravel)\b",
        r"\b(html|css|sass|scss|tailwind|bootstrap)\b",
        r"\b(sql|mysql|postgresql|mongodb|redis|firebase)\b",
        r"\b(docker|kubernetes|aws|azure|gcp|git)\b",
        r"\b(machine learning|ml|ai|data science|tensorflow|pytorch)\b",
    ]

    technologies: set[str] = set()
    for pattern in tech_patterns:
        technologies.update(re.findall(pattern, text_lower))

    # Checked in priority order: an "advanced" keyword wins over a "beginner"
    # one. "intermediate" is the fallback, so its keywords need no check.
    # NOTE: keywords match as substrings, e.g. "learning" also matches inside
    # "machine learning" and "early" inside "nearly".
    keyword_levels = (
        (
            "advanced",
            (
                "advanced",
                "expert",
                "senior",
                "professional",
                "years of experience",
                "proficient",
                "strong",
            ),
        ),
        (
            "beginner",
            (
                "beginner",
                "learning",
                "new to",
                "just started",
                "basic",
                "novice",
                "early",
            ),
        ),
    )

    level = "intermediate"
    for candidate, keywords in keyword_levels:
        if any(keyword in text_lower for keyword in keywords):
            level = candidate
            break

    # Sorted so the result does not depend on set iteration order.
    return level, sorted(technologies)


def evaluate_applicant(applicant: dict, index: int, total: int) -> dict:
    """Evaluate a single applicant's technical proficiency.

    Combines the level and technologies found in the applicant's free-text
    answers with public GitHub data (when ``project_url`` points to
    github.com) into a final ``beginner`` / ``intermediate`` / ``advanced``
    rating. Prints one progress line and pauses 0.5 s after GitHub lookups.

    Args:
        applicant: Mapping with a required ``discord_id`` and optional
            ``project_url``, ``proficiency_self`` and ``project_reason``
            entries; missing or null optional entries are treated as empty.
        index: Zero-based position of the applicant, used for progress output.
        total: Total number of applicants, used for progress output.

    Returns:
        A dict with the keys ``discord_id``, ``github_username``,
        ``github_repos_count``, ``github_followers``,
        ``languages_from_github``, ``languages_from_text``,
        ``self_described_level``, ``final_proficiency``, ``tech_stack`` and
        ``notes``.

    Raises:
        KeyError: If ``applicant`` has no ``discord_id``.
    """
    discord_id = applicant["discord_id"]
    # Optional answers may be absent or null in the input data.
    project_url = applicant.get("project_url") or ""
    proficiency_self = applicant.get("proficiency_self") or ""
    project_reason = applicant.get("project_reason") or ""

    print(f"[{index + 1}/{total}] Evaluating {discord_id}...")

    result = {
        "discord_id": discord_id,
        "github_username": None,
        "github_repos_count": 0,
        "github_followers": 0,
        "languages_from_github": [],
        "languages_from_text": [],
        "self_described_level": None,
        "final_proficiency": "intermediate",  # default
        "tech_stack": [],
        "notes": [],
    }

    # Analyze the self-description and the project motivation together.
    text_level, text_techs = analyze_proficiency_text(
        f"{proficiency_self} {project_reason}"
    )
    result["self_described_level"] = text_level
    result["languages_from_text"] = text_techs

    # Fetch GitHub data if a GitHub URL was provided.
    has_github_url = "github.com" in project_url
    profile_fetched = False
    if has_github_url:
        owner, repo = extract_github_info(project_url)

        if owner:
            result["github_username"] = owner

            user_data = fetch_github_user(owner)
            if user_data:
                profile_fetched = True
                result["github_repos_count"] = user_data.get("public_repos") or 0
                result["github_followers"] = user_data.get("followers") or 0

            # Primary language of the 10 most recently updated repos.
            github_languages = set()
            for repo_info in fetch_github_repos(owner)[:10]:
                if repo_info.get("language"):
                    github_languages.add(repo_info["language"].lower())

            # If a specific repo was linked, include all of its languages.
            if repo:
                github_languages.update(
                    lang.lower() for lang in fetch_repo_languages(owner, repo)
                )

            # Sorted so the output does not depend on set iteration order.
            result["languages_from_github"] = sorted(github_languages)

            time.sleep(0.5)  # Rate limiting

    # Combine tech stack from both sources.
    result["tech_stack"] = sorted(
        set(result["languages_from_github"]) | set(result["languages_from_text"])
    )

    # Activity score: up to 2 points each for repo count, followers and
    # breadth of the combined tech stack.
    activity_score = 0
    if result["github_repos_count"] >= 20:
        activity_score += 2
    elif result["github_repos_count"] >= 10:
        activity_score += 1

    if result["github_followers"] >= 50:
        activity_score += 2
    elif result["github_followers"] >= 10:
        activity_score += 1

    tech_count = len(result["tech_stack"])
    if tech_count >= 6:
        activity_score += 2
    elif tech_count >= 3:
        activity_score += 1

    # Map self-described level to score.
    level_scores = {"beginner": 0, "intermediate": 2, "advanced": 4}
    total_score = activity_score + level_scores.get(text_level, 2)

    if total_score >= 7:
        result["final_proficiency"] = "advanced"
    elif total_score >= 3:
        result["final_proficiency"] = "intermediate"
    else:
        result["final_proficiency"] = "beginner"

    # Add notes.
    if not has_github_url:
        result["notes"].append("No GitHub URL provided")
    if result["github_username"]:
        if not profile_fetched:
            # A failed lookup (network error, rate limit, unknown user) must
            # not be reported as an empty profile.
            result["notes"].append("GitHub profile could not be fetched")
        elif result["github_repos_count"] == 0:
            result["notes"].append("GitHub profile has no public repos")

    return result


def main():
    """Evaluate every applicant in the input file and write the results.

    Reads ``applicants_to_evaluate.json`` from the current working directory,
    evaluates each entry with ``evaluate_applicant``, writes the list of
    results to ``proficiency_evaluations.json`` and prints a summary of how
    many applicants landed in each proficiency level.
    """
    # JSON is read and written as UTF-8 explicitly so the result does not
    # depend on the platform's locale encoding.
    with open("applicants_to_evaluate.json", encoding="utf-8") as f:
        applicants = json.load(f)

    total = len(applicants)
    print(f"Evaluating {total} applicants...\n")

    evaluations = []
    for i, applicant in enumerate(applicants):
        evaluations.append(evaluate_applicant(applicant, i, total))

        # Progress update every 10
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{total} complete")

    # Save results
    with open("proficiency_evaluations.json", "w", encoding="utf-8") as f:
        json.dump(evaluations, f, indent=2)

    # Summary: tally evaluations per final proficiency level.
    counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
    for evaluation in evaluations:
        level = evaluation["final_proficiency"]
        if level in counts:
            counts[level] += 1

    print("\n=== EVALUATION COMPLETE ===")
    print(f"Beginner: {counts['beginner']}")
    print(f"Intermediate: {counts['intermediate']}")
    print(f"Advanced: {counts['advanced']}")
    print(f"Total: {len(evaluations)}")
    print("\nResults saved to proficiency_evaluations.json")


# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()