feat: add multi-lang support and cohort scripts (#1)

### Explanation _No response_ ### Issue _No response_ ### Attestations - [ ] I have read and agree to the [Code of Conduct](https://docs.nhcarrigan.com/community/coc/) - [ ] I have read and agree to the [Community Guidelines](https://docs.nhcarrigan.com/community/guide/). - [ ] My contribution complies with the [Contributor Covenant](https://docs.nhcarrigan.com/dev/covenant/). ### Dependencies - [ ] I have pinned the dependencies to a specific patch version. ### Style - [ ] I have run the linter and resolved any errors. - [ ] My pull request uses an appropriate title, matching the conventional commit standards. - [ ] My scope of feat/fix/chore/etc. correctly matches the nature of changes in my pull request. ### Tests - [ ] My contribution adds new code, and I have added tests to cover it. - [ ] My contribution modifies existing code, and I have updated the tests to reflect these changes. - [ ] All new and existing tests pass locally with my changes. - [ ] Code coverage remains at or above the configured threshold. ### Documentation _No response_ ### Versioning _No response_ Co-authored-by: Hikari <hikari@nhcarrigan.com> Reviewed-on: #1 Co-authored-by: Naomi Carrigan <commits@nhcarrigan.com> Co-committed-by: Naomi Carrigan <commits@nhcarrigan.com>
2026-01-23 20:07:16 -08:00
parent 38e7f15d93
commit 6b5fa40599
59 changed files with 2249 additions and 48 deletions
@@ -0,0 +1,271 @@
+import json
+import re
+import time
+import urllib.error
+import urllib.request
+
+# GitHub REST API base URL; requests are sent unauthenticated (public data only) and are rate limited.
+GITHUB_API = "https://api.github.com"
+
+
+def extract_github_info(url: str) -> tuple[str | None, str | None]:
+    """Extract owner and repo from GitHub URL."""
+    # Handle various GitHub URL formats
+    patterns = [
+        r"github\.com/([^/]+)/([^/\s?#]+)",  # github.com/owner/repo
+        r"github\.com/([^/\s?#]+)/?$",  # github.com/owner (profile)
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            groups = match.groups()
+            if len(groups) == 2:
+                return groups[0], groups[1].rstrip(".git")
+            elif len(groups) == 1:
+                return groups[0], None
+    return None, None
+
+
def fetch_github_user(username: str) -> dict | None:
    """Fetch a GitHub user's public profile.

    The lookup is best-effort: any failure is reported on stdout and turned
    into ``None`` so that one bad lookup does not abort the caller.

    Args:
        username: GitHub login to look up.

    Returns:
        The decoded JSON profile object, or ``None`` when the request fails
        or the response body is not a JSON object.
    """
    url = f"{GITHUB_API}/users/{username}"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even when reading or
        # decoding the body raises.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception as exc:
        # Surface the failure (e.g. HTTP 403 when rate limited) instead of
        # silently treating the user as having no profile.
        print(f"  Warning: could not fetch GitHub user {username!r}: {exc}")
        return None

    return payload if isinstance(payload, dict) else None
+
+
def fetch_github_repos(username: str) -> list:
    """Fetch up to 100 of a user's public repositories, most recently updated first.

    The lookup is best-effort: any failure is reported on stdout and turned
    into an empty list so that one bad lookup does not abort the caller.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        The decoded JSON list of repository objects, or ``[]`` when the
        request fails or the response body is not a JSON array.
    """
    url = f"{GITHUB_API}/users/{username}/repos?per_page=100&sort=updated"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even when reading or
        # decoding the body raises.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception as exc:
        # Surface the failure (e.g. HTTP 403 when rate limited) instead of
        # silently reporting "no repositories".
        print(f"  Warning: could not fetch repos for {username!r}: {exc}")
        return []

    # Callers slice and iterate the result, so only hand back a real list.
    return payload if isinstance(payload, list) else []
+
+
def fetch_repo_languages(owner: str, repo: str) -> dict:
    """Fetch the languages GitHub reports for a repository.

    The lookup is best-effort: any failure is reported on stdout and turned
    into an empty dict so that one bad lookup does not abort the caller.

    Args:
        owner: Login of the repository owner.
        repo: Repository name.

    Returns:
        The decoded JSON object keyed by language name, or ``{}`` when the
        request fails or the response body is not a JSON object.
    """
    url = f"{GITHUB_API}/repos/{owner}/{repo}/languages"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even when reading or
        # decoding the body raises.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception as exc:
        # Surface the failure (e.g. HTTP 404 for a missing repo) instead of
        # silently reporting "no languages".
        print(f"  Warning: could not fetch languages for {owner}/{repo}: {exc}")
        return {}

    return payload if isinstance(payload, dict) else {}
+
+
def analyze_proficiency_text(text: str) -> tuple[str, list[str]]:
    """Infer a proficiency level and the technologies named in free-form text.

    Matching is case-insensitive: the text is lower-cased before any lookup.

    Args:
        text: Applicant-written description; ``None`` or ``""`` is treated as
            empty text.

    Returns:
        ``(level, technologies)`` where ``level`` is ``"beginner"``,
        ``"intermediate"`` or ``"advanced"``, and ``technologies`` is the
        sorted list of distinct lower-case technology names found.
    """
    text_lower = (text or "").lower()

    # Technology names, matched as whole words.
    tech_patterns = (
        r"\b(python|java|javascript|typescript|ruby|go|rust|swift|kotlin|php|perl|scala|r)\b",
        # "c++" and "c#" end in a non-word character, so a trailing \b would
        # only succeed when a word character follows; it would miss them
        # before a space, punctuation or the end of the text.
        r"\b(c\+\+|c#)",
        r"\b(react|angular|vue|node|express|django|flask|spring|rails|laravel)\b",
        r"\b(html|css|sass|scss|tailwind|bootstrap)\b",
        r"\b(sql|mysql|postgresql|mongodb|redis|firebase)\b",
        r"\b(docker|kubernetes|aws|azure|gcp|git)\b",
        r"\b(machine learning|ml|ai|data science|tensorflow|pytorch)\b",
    )

    technologies: set[str] = set()
    for pattern in tech_patterns:
        technologies.update(re.findall(pattern, text_lower))

    # Level keywords are plain substring tests, checked in priority order:
    # any advanced keyword wins over any beginner keyword.
    # NOTE(review): substring matching means "machine learning" contains the
    # beginner keyword "learning" -- confirm this is intended.
    advanced_keywords = (
        "advanced",
        "expert",
        "senior",
        "professional",
        "years of experience",
        "proficient",
        "strong",
    )
    beginner_keywords = (
        "beginner",
        "learning",
        "new to",
        "just started",
        "basic",
        "novice",
        "early",
    )

    if any(kw in text_lower for kw in advanced_keywords):
        level = "advanced"
    elif any(kw in text_lower for kw in beginner_keywords):
        level = "beginner"
    else:
        # "intermediate" is the result whenever neither list matches, so
        # intermediate phrases need no keyword list of their own.
        level = "intermediate"

    # Sort so the result does not depend on set iteration order.
    return level, sorted(technologies)
+
+
def _tiered_score(value: int, high: int, low: int) -> int:
    """Return 2 if ``value >= high``, 1 if ``value >= low``, otherwise 0."""
    if value >= high:
        return 2
    return 1 if value >= low else 0


def evaluate_applicant(applicant: dict, index: int, total: int) -> dict:
    """Evaluate a single applicant's technical proficiency.

    Combines the level inferred from the applicant's own text with public
    GitHub signals (repository count, followers, languages) into a final
    "beginner" / "intermediate" / "advanced" rating. Prints one progress line
    and sleeps 0.5 s after a GitHub lookup.

    Args:
        applicant: Record with a required ``discord_id`` and optional
            ``project_url``, ``proficiency_self`` and ``project_reason``
            entries; missing or ``None`` optional values are treated as empty.
        index: Zero-based position of this applicant (progress output only).
        total: Number of applicants being evaluated (progress output only).

    Returns:
        A dict with the keys ``discord_id``, ``github_username``,
        ``github_repos_count``, ``github_followers``,
        ``languages_from_github``, ``languages_from_text``,
        ``self_described_level``, ``final_proficiency``, ``tech_stack`` and
        ``notes``.

    Raises:
        KeyError: If ``applicant`` has no ``discord_id`` entry.
    """
    discord_id = applicant["discord_id"]
    # JSON nulls / missing keys become "" so string operations below are safe.
    project_url = applicant.get("project_url") or ""
    proficiency_self = applicant.get("proficiency_self") or ""
    project_reason = applicant.get("project_reason") or ""
    has_github_url = "github.com" in project_url

    print(f"[{index + 1}/{total}] Evaluating {discord_id}...")

    result = {
        "discord_id": discord_id,
        "github_username": None,
        "github_repos_count": 0,
        "github_followers": 0,
        "languages_from_github": [],
        "languages_from_text": [],
        "self_described_level": None,
        "final_proficiency": "intermediate",  # default
        "tech_stack": [],
        "notes": [],
    }

    # Analyze self-description
    text_level, text_techs = analyze_proficiency_text(
        proficiency_self + " " + project_reason
    )
    result["self_described_level"] = text_level
    result["languages_from_text"] = text_techs

    # Fetch GitHub data if a GitHub URL was provided
    if has_github_url:
        owner, repo = extract_github_info(project_url)

        if owner:
            result["github_username"] = owner

            user_data = fetch_github_user(owner)
            if user_data:
                result["github_repos_count"] = user_data.get("public_repos") or 0
                result["github_followers"] = user_data.get("followers") or 0

            # Primary language of the first 10 repos returned for the owner.
            repos = fetch_github_repos(owner)
            if not isinstance(repos, list):
                repos = []
            languages = {
                entry["language"].lower()
                for entry in repos[:10]
                if isinstance(entry, dict) and entry.get("language")
            }

            # If a specific repo was linked, include every language it uses.
            if repo:
                languages.update(
                    lang.lower() for lang in fetch_repo_languages(owner, repo)
                )

            # Sorted so the output does not depend on set iteration order.
            result["languages_from_github"] = sorted(languages)

            time.sleep(0.5)  # Rate limiting

    # Combine tech stack
    result["tech_stack"] = sorted(
        set(result["languages_from_github"]) | set(result["languages_from_text"])
    )

    # Activity score: repo count, followers and tech diversity contribute
    # 0-2 points each.
    activity_score = (
        _tiered_score(result["github_repos_count"], 20, 10)
        + _tiered_score(result["github_followers"], 50, 10)
        + _tiered_score(len(result["tech_stack"]), 6, 3)
    )

    # Map self-described level to score
    level_scores = {"beginner": 0, "intermediate": 2, "advanced": 4}
    total_score = activity_score + level_scores.get(text_level, 2)

    if total_score >= 7:
        result["final_proficiency"] = "advanced"
    elif total_score >= 3:
        result["final_proficiency"] = "intermediate"
    else:
        result["final_proficiency"] = "beginner"

    # Add notes
    if not has_github_url:
        result["notes"].append("No GitHub URL provided")
    if result["github_repos_count"] == 0 and result["github_username"]:
        result["notes"].append("GitHub profile has no public repos")

    return result
+
+
def main() -> None:
    """Evaluate every applicant and write the results to disk.

    Reads a JSON array of applicant records from
    ``applicants_to_evaluate.json``, evaluates each with
    ``evaluate_applicant``, writes the list of results to
    ``proficiency_evaluations.json`` and prints a per-level summary.
    """
    # JSON is UTF-8 by specification; be explicit rather than relying on the
    # platform's locale encoding.
    with open("applicants_to_evaluate.json", encoding="utf-8") as f:
        applicants = json.load(f)

    total = len(applicants)
    print(f"Evaluating {total} applicants...\n")

    evaluations = []
    for i, applicant in enumerate(applicants):
        evaluations.append(evaluate_applicant(applicant, i, total))

        # Progress update every 10
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{total} complete")

    # Save results
    with open("proficiency_evaluations.json", "w", encoding="utf-8") as f:
        json.dump(evaluations, f, indent=2)

    # Summary: tally the three levels in a single pass.
    counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
    for evaluation in evaluations:
        level = evaluation["final_proficiency"]
        if level in counts:
            counts[level] += 1

    print("\n=== EVALUATION COMPLETE ===")
    print(f"Beginner: {counts['beginner']}")
    print(f"Intermediate: {counts['intermediate']}")
    print(f"Advanced: {counts['advanced']}")
    print(f"Total: {len(evaluations)}")
    print("\nResults saved to proficiency_evaluations.json")
+
+
+if __name__ == "__main__":  # run the evaluation only when executed as a script, not on import
+    main()