import json import re import time import urllib.error import urllib.request # GitHub API (no auth needed for public repos, but rate limited) GITHUB_API = "https://api.github.com" def extract_github_info(url: str) -> tuple[str | None, str | None]: """Extract owner and repo from GitHub URL.""" # Handle various GitHub URL formats patterns = [ r"github\.com/([^/]+)/([^/\s?#]+)", # github.com/owner/repo r"github\.com/([^/\s?#]+)/?$", # github.com/owner (profile) ] for pattern in patterns: match = re.search(pattern, url) if match: groups = match.groups() if len(groups) == 2: return groups[0], groups[1].rstrip(".git") elif len(groups) == 1: return groups[0], None return None, None def fetch_github_user(username: str) -> dict | None: """Fetch GitHub user profile.""" url = f"{GITHUB_API}/users/{username}" req = urllib.request.Request(url) req.add_header("Accept", "application/vnd.github.v3+json") req.add_header("User-Agent", "Cohort-Evaluator") try: response = urllib.request.urlopen(req, timeout=10) return json.loads(response.read().decode()) except Exception: return None def fetch_github_repos(username: str) -> list: """Fetch user's public repos.""" url = f"{GITHUB_API}/users/{username}/repos?per_page=100&sort=updated" req = urllib.request.Request(url) req.add_header("Accept", "application/vnd.github.v3+json") req.add_header("User-Agent", "Cohort-Evaluator") try: response = urllib.request.urlopen(req, timeout=10) return json.loads(response.read().decode()) except Exception: return [] def fetch_repo_languages(owner: str, repo: str) -> dict: """Fetch languages used in a repo.""" url = f"{GITHUB_API}/repos/{owner}/{repo}/languages" req = urllib.request.Request(url) req.add_header("Accept", "application/vnd.github.v3+json") req.add_header("User-Agent", "Cohort-Evaluator") try: response = urllib.request.urlopen(req, timeout=10) return json.loads(response.read().decode()) except Exception: return {} def analyze_proficiency_text(text: str) -> tuple[str, list[str]]: """Analyze self-described proficiency text.""" text_lower = text.lower() # Extract languages/technologies mentioned tech_patterns = [ r"\b(python|java|javascript|typescript|c\+\+|c#|ruby|go|rust|swift|kotlin|php|perl|scala|r)\b", r"\b(react|angular|vue|node|express|django|flask|spring|rails|laravel)\b", r"\b(html|css|sass|scss|tailwind|bootstrap)\b", r"\b(sql|mysql|postgresql|mongodb|redis|firebase)\b", r"\b(docker|kubernetes|aws|azure|gcp|git)\b", r"\b(machine learning|ml|ai|data science|tensorflow|pytorch)\b", ] technologies = set() for pattern in tech_patterns: matches = re.findall(pattern, text_lower) technologies.update(matches) # Determine level from keywords beginner_keywords = [ "beginner", "learning", "new to", "just started", "basic", "novice", "early", ] intermediate_keywords = [ "intermediate", "comfortable", "familiar", "some experience", "worked with", ] advanced_keywords = [ "advanced", "expert", "senior", "professional", "years of experience", "proficient", "strong", ] level = "intermediate" # default if any(kw in text_lower for kw in advanced_keywords): level = "advanced" elif any(kw in text_lower for kw in beginner_keywords): level = "beginner" elif any(kw in text_lower for kw in intermediate_keywords): level = "intermediate" return level, list(technologies) def evaluate_applicant(applicant: dict, index: int, total: int) -> dict: """Evaluate a single applicant's technical proficiency.""" discord_id = applicant["discord_id"] project_url = applicant["project_url"] proficiency_self = applicant["proficiency_self"] project_reason = applicant["project_reason"] print(f"[{index + 1}/{total}] Evaluating {discord_id}...") result = { "discord_id": discord_id, "github_username": None, "github_repos_count": 0, "github_followers": 0, "languages_from_github": [], "languages_from_text": [], "self_described_level": None, "final_proficiency": "intermediate", # default "tech_stack": [], "notes": [], } # Analyze self-description text_level, text_techs = analyze_proficiency_text( proficiency_self + " " + project_reason ) result["self_described_level"] = text_level result["languages_from_text"] = text_techs # Fetch GitHub data if URL provided if project_url and "github.com" in project_url: owner, repo = extract_github_info(project_url) if owner: result["github_username"] = owner # Fetch user profile user_data = fetch_github_user(owner) if user_data: result["github_repos_count"] = user_data.get("public_repos", 0) result["github_followers"] = user_data.get("followers", 0) # Fetch repos to get languages repos = fetch_github_repos(owner) all_languages = set() for r in repos[:10]: # Check top 10 repos if r.get("language"): all_languages.add(r["language"].lower()) result["languages_from_github"] = list(all_languages) # If specific repo provided, get its languages if repo: repo_langs = fetch_repo_languages(owner, repo) for lang in repo_langs: all_languages.add(lang.lower()) result["languages_from_github"] = list(all_languages) time.sleep(0.5) # Rate limiting # Combine tech stack all_tech = set(result["languages_from_github"]) | set(result["languages_from_text"]) result["tech_stack"] = sorted(all_tech) # Determine final proficiency # Factors: self-description, GitHub activity, tech diversity github_score = 0 if result["github_repos_count"] >= 20: github_score += 2 elif result["github_repos_count"] >= 10: github_score += 1 if result["github_followers"] >= 50: github_score += 2 elif result["github_followers"] >= 10: github_score += 1 tech_count = len(result["tech_stack"]) if tech_count >= 6: github_score += 2 elif tech_count >= 3: github_score += 1 # Map self-described level to score level_scores = {"beginner": 0, "intermediate": 2, "advanced": 4} self_score = level_scores.get(text_level, 2) # Combined score total_score = github_score + self_score if total_score >= 7: result["final_proficiency"] = "advanced" elif total_score >= 3: result["final_proficiency"] = "intermediate" else: result["final_proficiency"] = "beginner" # Add notes if not project_url or "github.com" not in project_url: result["notes"].append("No GitHub URL provided") if result["github_repos_count"] == 0 and result["github_username"]: result["notes"].append("GitHub profile has no public repos") return result def main(): # Load applicants with open("applicants_to_evaluate.json") as f: applicants = json.load(f) print(f"Evaluating {len(applicants)} applicants...\n") evaluations = [] for i, applicant in enumerate(applicants): result = evaluate_applicant(applicant, i, len(applicants)) evaluations.append(result) # Progress update every 10 if (i + 1) % 10 == 0: print(f" Progress: {i + 1}/{len(applicants)} complete") # Save results with open("proficiency_evaluations.json", "w") as f: json.dump(evaluations, f, indent=2) # Summary beginner = sum(1 for e in evaluations if e["final_proficiency"] == "beginner") intermediate = sum( 1 for e in evaluations if e["final_proficiency"] == "intermediate" ) advanced = sum(1 for e in evaluations if e["final_proficiency"] == "advanced") print("\n=== EVALUATION COMPLETE ===") print(f"Beginner: {beginner}") print(f"Intermediate: {intermediate}") print(f"Advanced: {advanced}") print(f"Total: {len(evaluations)}") print("\nResults saved to proficiency_evaluations.json") if __name__ == "__main__": main()