generated from nhcarrigan/template
a40188413a
All Python cohort scripts now use DATA_DIR = Path(__file__).parent.parent.parent / "data" to correctly resolve the repo-root data/ directory regardless of the working directory set by run.sh. All TypeScript scripts have expanded JSDoc headers documenting data file requirements and environment variables.
291 lines
9.2 KiB
Python
291 lines
9.2 KiB
Python
"""Evaluate the technical proficiency of cohort applicants using their GitHub profiles.
|
|
|
|
Fetches each applicant's public GitHub profile and repositories, then rates their
proficiency as beginner, intermediate, or advanced based on their self-described
experience, public repo count, follower count, and the variety of technologies found.
|
|
|
|
Data files (place in data/):
|
|
- applicants_to_evaluate.json List of applicants (discord_id, project_url, proficiency_self, project_reason)
|
|
|
|
Outputs (written to data/):
|
|
- proficiency_evaluations.json Proficiency scores and tech stacks per applicant
|
|
|
|
Env vars:
|
|
- None (uses public GitHub API; may be rate-limited without authentication)
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
# data/ directory located three levels above this file; resolved from __file__
# so the path does not depend on the current working directory.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
|
|
|
# GitHub API (no auth needed for public repos, but rate limited)
|
|
# Base URL that every fetch_* helper in this script prefixes to its endpoint path.
GITHUB_API = "https://api.github.com"
|
|
|
|
|
|
def extract_github_info(url: str) -> tuple[str | None, str | None]:
|
|
"""Extract owner and repo from GitHub URL."""
|
|
# Handle various GitHub URL formats
|
|
patterns = [
|
|
r"github\.com/([^/]+)/([^/\s?#]+)", # github.com/owner/repo
|
|
r"github\.com/([^/\s?#]+)/?$", # github.com/owner (profile)
|
|
]
|
|
|
|
for pattern in patterns:
|
|
match = re.search(pattern, url)
|
|
if match:
|
|
groups = match.groups()
|
|
if len(groups) == 2:
|
|
return groups[0], groups[1].rstrip(".git")
|
|
elif len(groups) == 1:
|
|
return groups[0], None
|
|
return None, None
|
|
|
|
|
|
def fetch_github_user(username: str) -> dict | None:
    """Fetch a GitHub user's public profile.

    Args:
        username: GitHub login to look up.

    Returns:
        The decoded JSON profile object, or ``None`` when the request or the
        decoding fails, or when the payload is not a JSON object.
    """
    url = f"{GITHUB_API}/users/{username}"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even when reading or
        # decoding raises; a bare urlopen() call leaves that to the garbage
        # collector.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        # Deliberately broad best-effort lookup: any network, HTTP or parsing
        # failure is reported to the caller as "no data" (None).
        return None

    return payload if isinstance(payload, dict) else None
|
|
|
|
|
|
def fetch_github_repos(username: str) -> list:
    """Fetch a user's public repositories, most recently updated first.

    Only the first page is requested (up to 100 repositories).

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        The decoded JSON list of repository objects, or an empty list when the
        request or the decoding fails, or when the payload is not a list.
    """
    url = f"{GITHUB_API}/users/{username}/repos?per_page=100&sort=updated"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even when reading or
        # decoding raises; a bare urlopen() call leaves that to the garbage
        # collector.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        # Deliberately broad best-effort lookup: any network, HTTP or parsing
        # failure is reported to the caller as "no repositories".
        return []

    # Callers slice and iterate the result, so never hand back a non-list
    # payload (for example a JSON object carrying a message).
    return payload if isinstance(payload, list) else []
|
|
|
|
|
|
def fetch_repo_languages(owner: str, repo: str) -> dict:
    """Fetch the languages GitHub reports for one repository.

    Args:
        owner: Login of the repository owner.
        repo: Repository name.

    Returns:
        The decoded JSON object keyed by language name, or an empty dict when
        the request or the decoding fails, or when the payload is not a JSON
        object.
    """
    url = f"{GITHUB_API}/repos/{owner}/{repo}/languages"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")

    try:
        # The context manager closes the HTTP response even when reading or
        # decoding raises; a bare urlopen() call leaves that to the garbage
        # collector.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        # Deliberately broad best-effort lookup: any network, HTTP or parsing
        # failure is reported to the caller as "no languages".
        return {}

    return payload if isinstance(payload, dict) else {}
|
|
|
|
|
|
def analyze_proficiency_text(text: str) -> tuple[str, list[str]]:
    """Derive a proficiency level and mentioned technologies from free text.

    Args:
        text: Applicant-written description of their experience.

    Returns:
        A ``(level, technologies)`` pair. ``level`` is ``"advanced"``,
        ``"beginner"`` or ``"intermediate"`` (advanced cues win over beginner
        cues; text with neither is intermediate). ``technologies`` is the
        sorted list of distinct lower-cased technology names found in the text.
    """
    text_lower = text.lower()

    # Alternatives per technology family. They are wrapped in lookarounds
    # rather than \b...\b because \b never matches after a trailing '+' or '#'
    # that is followed by a space or the end of the text, which made "c++" and
    # "c#" undetectable.
    tech_groups = (
        r"python|java|javascript|typescript|c\+\+|c#|ruby|go|rust|swift|kotlin|php|perl|scala|r",
        r"react|angular|vue|node|express|django|flask|spring|rails|laravel",
        r"html|css|sass|scss|tailwind|bootstrap",
        r"sql|mysql|postgresql|mongodb|redis|firebase",
        r"docker|kubernetes|aws|azure|gcp|git",
        r"machine learning|ml|ai|data science|tensorflow|pytorch",
    )

    technologies: set[str] = set()
    for alternatives in tech_groups:
        pattern = rf"(?<!\w)({alternatives})(?!\w)"
        technologies.update(re.findall(pattern, text_lower))

    beginner_keywords = (
        "beginner",
        "learning",
        "new to",
        "just started",
        "basic",
        "novice",
        "early",
    )
    advanced_keywords = (
        "advanced",
        "expert",
        "senior",
        "professional",
        "years of experience",
        "proficient",
        "strong",
    )

    # The technology name "machine learning" must not count as the beginner
    # cue "learning".
    level_text = text_lower.replace("machine learning", " ")

    def mentions(keywords: tuple[str, ...]) -> bool:
        # Anchor only the start of each keyword: inflections such as
        # "professionally" or "basics" still count, while accidental
        # substrings such as "early" inside "clearly" or "nearly" do not.
        alternation = "|".join(re.escape(keyword) for keyword in keywords)
        return re.search(rf"\b(?:{alternation})", level_text) is not None

    if mentions(advanced_keywords):
        level = "advanced"
    elif mentions(beginner_keywords):
        level = "beginner"
    else:
        # No advanced or beginner cue: treated as intermediate, which is also
        # where explicit cues such as "comfortable" or "familiar" land.
        level = "intermediate"

    # Sorted so the output is deterministic instead of following set order.
    return level, sorted(technologies)
|
|
|
|
|
|
def _proficiency_from_signals(
    self_level: str, repos_count: int, followers: int, tech_count: int
) -> str:
    """Combine the self-described level with activity signals into a final level."""
    # Each signal contributes 2 points at its high threshold, 1 at its low one.
    activity_score = 0
    for value, high, low in (
        (repos_count, 20, 10),
        (followers, 50, 10),
        (tech_count, 6, 3),
    ):
        if value >= high:
            activity_score += 2
        elif value >= low:
            activity_score += 1

    level_scores = {"beginner": 0, "intermediate": 2, "advanced": 4}
    total_score = activity_score + level_scores.get(self_level, 2)

    if total_score >= 7:
        return "advanced"
    if total_score >= 3:
        return "intermediate"
    return "beginner"


def evaluate_applicant(applicant: dict, index: int, total: int) -> dict:
    """Evaluate a single applicant's technical proficiency.

    Args:
        applicant: Applicant record. ``discord_id`` is required;
            ``project_url``, ``proficiency_self`` and ``project_reason`` are
            read when present and treated as empty when missing or null.
        index: Zero-based position of the applicant, used for progress output.
        total: Number of applicants, used for progress output.

    Returns:
        A dict with the keys ``discord_id``, ``github_username``,
        ``github_repos_count``, ``github_followers``, ``languages_from_github``,
        ``languages_from_text``, ``self_described_level``,
        ``final_proficiency``, ``tech_stack`` and ``notes``.

    Raises:
        KeyError: If ``applicant`` has no ``discord_id``.
    """
    discord_id = applicant["discord_id"]
    # The URL was already treated as optional; give the text fields the same
    # tolerance so a null answer does not raise on string concatenation.
    project_url = applicant.get("project_url") or ""
    proficiency_self = applicant.get("proficiency_self") or ""
    project_reason = applicant.get("project_reason") or ""

    print(f"[{index + 1}/{total}] Evaluating {discord_id}...")

    result = {
        "discord_id": discord_id,
        "github_username": None,
        "github_repos_count": 0,
        "github_followers": 0,
        "languages_from_github": [],
        "languages_from_text": [],
        "self_described_level": None,
        "final_proficiency": "intermediate",  # default
        "tech_stack": [],
        "notes": [],
    }

    # Analyze self-description
    text_level, text_techs = analyze_proficiency_text(
        proficiency_self + " " + project_reason
    )
    result["self_described_level"] = text_level
    result["languages_from_text"] = text_techs

    # Fetch GitHub data if URL provided
    has_github_url = "github.com" in project_url
    profile_fetched = False

    if has_github_url:
        owner, repo = extract_github_info(project_url)

        if owner:
            result["github_username"] = owner

            user_data = fetch_github_user(owner)
            if user_data:
                profile_fetched = True
                result["github_repos_count"] = user_data.get("public_repos") or 0
                result["github_followers"] = user_data.get("followers") or 0

            # Primary language of the first 10 repositories returned.
            repos = fetch_github_repos(owner)
            github_languages = {
                r["language"].lower() for r in repos[:10] if r.get("language")
            }

            # If a specific repo was linked, add every language it reports.
            if repo:
                github_languages.update(
                    lang.lower() for lang in fetch_repo_languages(owner, repo)
                )

            # Sorted so the output does not depend on set iteration order.
            result["languages_from_github"] = sorted(github_languages)

            time.sleep(0.5)  # Rate limiting

    # Combine tech stack
    all_tech = set(result["languages_from_github"]) | set(result["languages_from_text"])
    result["tech_stack"] = sorted(all_tech)

    # Factors: self-description, GitHub activity, tech diversity
    result["final_proficiency"] = _proficiency_from_signals(
        text_level,
        result["github_repos_count"],
        result["github_followers"],
        len(result["tech_stack"]),
    )

    # Add notes
    if not has_github_url:
        result["notes"].append("No GitHub URL provided")
    if result["github_username"]:
        if not profile_fetched:
            # A failed lookup leaves the repo count at its default of 0; do not
            # report that as an empty profile.
            result["notes"].append("GitHub profile could not be fetched")
        elif result["github_repos_count"] == 0:
            result["notes"].append("GitHub profile has no public repos")

    return result
|
|
|
|
|
|
def main() -> None:
    """Evaluate every applicant in the input file and save the results.

    Reads ``applicants_to_evaluate.json`` from ``DATA_DIR``, evaluates each
    entry with ``evaluate_applicant``, writes the list of results to
    ``proficiency_evaluations.json`` in the same directory, and prints a
    per-level summary.
    """
    # Load applicants. The encoding is explicit so the free-text answers are
    # decoded as UTF-8 regardless of the platform's locale default.
    with open(DATA_DIR / "applicants_to_evaluate.json", encoding="utf-8") as f:
        applicants = json.load(f)

    applicant_count = len(applicants)
    print(f"Evaluating {applicant_count} applicants...\n")

    evaluations = []
    for i, applicant in enumerate(applicants):
        evaluations.append(evaluate_applicant(applicant, i, applicant_count))

        # Progress update every 10
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{applicant_count} complete")

    # Save results
    with open(DATA_DIR / "proficiency_evaluations.json", "w", encoding="utf-8") as f:
        json.dump(evaluations, f, indent=2)

    # Summary
    tally = {"beginner": 0, "intermediate": 0, "advanced": 0}
    for evaluation in evaluations:
        level = evaluation["final_proficiency"]
        if level in tally:
            tally[level] += 1

    print("\n=== EVALUATION COMPLETE ===")
    print(f"Beginner: {tally['beginner']}")
    print(f"Intermediate: {tally['intermediate']}")
    print(f"Advanced: {tally['advanced']}")
    print(f"Total: {len(evaluations)}")
    print("\nResults saved to proficiency_evaluations.json")
|
|
|
|
|
|
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|