"""Evaluate the technical proficiency of cohort applicants using their GitHub profiles. Fetches each applicant's public GitHub repositories and scores their proficiency as Beginner, Intermediate, or Advanced based on language variety, repo count, commit activity, and presence of certain technologies. Data files (place in data/): - applicants_to_evaluate.json List of applicants with GitHub usernames Outputs (written to data/): - proficiency_evaluations.json Proficiency scores and tech stacks per applicant Env vars: - None (uses public GitHub API; may be rate-limited without authentication) """ import json import re import time import urllib.error import urllib.request from pathlib import Path DATA_DIR = Path(__file__).parent.parent.parent / "data" # GitHub API (no auth needed for public repos, but rate limited) GITHUB_API = "https://api.github.com" def extract_github_info(url: str) -> tuple[str | None, str | None]: """Extract owner and repo from GitHub URL.""" # Handle various GitHub URL formats patterns = [ r"github\.com/([^/]+)/([^/\s?#]+)", # github.com/owner/repo r"github\.com/([^/\s?#]+)/?$", # github.com/owner (profile) ] for pattern in patterns: match = re.search(pattern, url) if match: groups = match.groups() if len(groups) == 2: return groups[0], groups[1].rstrip(".git") elif len(groups) == 1: return groups[0], None return None, None def fetch_github_user(username: str) -> dict | None: """Fetch GitHub user profile.""" url = f"{GITHUB_API}/users/{username}" req = urllib.request.Request(url) req.add_header("Accept", "application/vnd.github.v3+json") req.add_header("User-Agent", "Cohort-Evaluator") try: response = urllib.request.urlopen(req, timeout=10) return json.loads(response.read().decode()) except Exception: return None def fetch_github_repos(username: str) -> list: """Fetch user's public repos.""" url = f"{GITHUB_API}/users/{username}/repos?per_page=100&sort=updated" req = urllib.request.Request(url) req.add_header("Accept", "application/vnd.github.v3+json") req.add_header("User-Agent", "Cohort-Evaluator") try: response = urllib.request.urlopen(req, timeout=10) return json.loads(response.read().decode()) except Exception: return [] def fetch_repo_languages(owner: str, repo: str) -> dict: """Fetch languages used in a repo.""" url = f"{GITHUB_API}/repos/{owner}/{repo}/languages" req = urllib.request.Request(url) req.add_header("Accept", "application/vnd.github.v3+json") req.add_header("User-Agent", "Cohort-Evaluator") try: response = urllib.request.urlopen(req, timeout=10) return json.loads(response.read().decode()) except Exception: return {} def analyze_proficiency_text(text: str) -> tuple[str, list[str]]: """Analyze self-described proficiency text.""" text_lower = text.lower() # Extract languages/technologies mentioned tech_patterns = [ r"\b(python|java|javascript|typescript|c\+\+|c#|ruby|go|rust|swift|kotlin|php|perl|scala|r)\b", r"\b(react|angular|vue|node|express|django|flask|spring|rails|laravel)\b", r"\b(html|css|sass|scss|tailwind|bootstrap)\b", r"\b(sql|mysql|postgresql|mongodb|redis|firebase)\b", r"\b(docker|kubernetes|aws|azure|gcp|git)\b", r"\b(machine learning|ml|ai|data science|tensorflow|pytorch)\b", ] technologies = set() for pattern in tech_patterns: matches = re.findall(pattern, text_lower) technologies.update(matches) # Determine level from keywords beginner_keywords = [ "beginner", "learning", "new to", "just started", "basic", "novice", "early", ] intermediate_keywords = [ "intermediate", "comfortable", "familiar", "some experience", "worked with", ] advanced_keywords = [ "advanced", "expert", "senior", "professional", "years of experience", "proficient", "strong", ] level = "intermediate" # default if any(kw in text_lower for kw in advanced_keywords): level = "advanced" elif any(kw in text_lower for kw in beginner_keywords): level = "beginner" elif any(kw in text_lower for kw in intermediate_keywords): level = "intermediate" return level, list(technologies) def evaluate_applicant(applicant: dict, index: int, total: int) -> dict: """Evaluate a single applicant's technical proficiency.""" discord_id = applicant["discord_id"] project_url = applicant["project_url"] proficiency_self = applicant["proficiency_self"] project_reason = applicant["project_reason"] print(f"[{index + 1}/{total}] Evaluating {discord_id}...") result = { "discord_id": discord_id, "github_username": None, "github_repos_count": 0, "github_followers": 0, "languages_from_github": [], "languages_from_text": [], "self_described_level": None, "final_proficiency": "intermediate", # default "tech_stack": [], "notes": [], } # Analyze self-description text_level, text_techs = analyze_proficiency_text( proficiency_self + " " + project_reason ) result["self_described_level"] = text_level result["languages_from_text"] = text_techs # Fetch GitHub data if URL provided if project_url and "github.com" in project_url: owner, repo = extract_github_info(project_url) if owner: result["github_username"] = owner # Fetch user profile user_data = fetch_github_user(owner) if user_data: result["github_repos_count"] = user_data.get("public_repos", 0) result["github_followers"] = user_data.get("followers", 0) # Fetch repos to get languages repos = fetch_github_repos(owner) all_languages = set() for r in repos[:10]: # Check top 10 repos if r.get("language"): all_languages.add(r["language"].lower()) result["languages_from_github"] = list(all_languages) # If specific repo provided, get its languages if repo: repo_langs = fetch_repo_languages(owner, repo) for lang in repo_langs: all_languages.add(lang.lower()) result["languages_from_github"] = list(all_languages) time.sleep(0.5) # Rate limiting # Combine tech stack all_tech = set(result["languages_from_github"]) | set(result["languages_from_text"]) result["tech_stack"] = sorted(all_tech) # Determine final proficiency # Factors: self-description, GitHub activity, tech diversity github_score = 0 if result["github_repos_count"] >= 20: github_score += 2 elif result["github_repos_count"] >= 10: github_score += 1 if result["github_followers"] >= 50: github_score += 2 elif result["github_followers"] >= 10: github_score += 1 tech_count = len(result["tech_stack"]) if tech_count >= 6: github_score += 2 elif tech_count >= 3: github_score += 1 # Map self-described level to score level_scores = {"beginner": 0, "intermediate": 2, "advanced": 4} self_score = level_scores.get(text_level, 2) # Combined score total_score = github_score + self_score if total_score >= 7: result["final_proficiency"] = "advanced" elif total_score >= 3: result["final_proficiency"] = "intermediate" else: result["final_proficiency"] = "beginner" # Add notes if not project_url or "github.com" not in project_url: result["notes"].append("No GitHub URL provided") if result["github_repos_count"] == 0 and result["github_username"]: result["notes"].append("GitHub profile has no public repos") return result def main(): # Load applicants with open(DATA_DIR / "applicants_to_evaluate.json") as f: applicants = json.load(f) print(f"Evaluating {len(applicants)} applicants...\n") evaluations = [] for i, applicant in enumerate(applicants): result = evaluate_applicant(applicant, i, len(applicants)) evaluations.append(result) # Progress update every 10 if (i + 1) % 10 == 0: print(f" Progress: {i + 1}/{len(applicants)} complete") # Save results with open(DATA_DIR / "proficiency_evaluations.json", "w") as f: json.dump(evaluations, f, indent=2) # Summary beginner = sum(1 for e in evaluations if e["final_proficiency"] == "beginner") intermediate = sum( 1 for e in evaluations if e["final_proficiency"] == "intermediate" ) advanced = sum(1 for e in evaluations if e["final_proficiency"] == "advanced") print("\n=== EVALUATION COMPLETE ===") print(f"Beginner: {beginner}") print(f"Intermediate: {intermediate}") print(f"Advanced: {advanced}") print(f"Total: {len(evaluations)}") print("\nResults saved to proficiency_evaluations.json") if __name__ == "__main__": main()