Files
ephemere/python/cohort/evaluate_technical_proficiency.py
T
naomi a40188413a docs: add data file documentation and fix data path resolution
All Python cohort scripts now use DATA_DIR = Path(__file__).parent.parent.parent / "data"
to correctly resolve the repo-root data/ directory regardless of the working directory
set by run.sh. All TypeScript scripts have expanded JSDoc headers documenting data file
requirements and environment variables.
2026-02-23 15:42:03 -08:00

291 lines
9.2 KiB
Python

"""Evaluate the technical proficiency of cohort applicants using their GitHub profiles.
Looks up each applicant's public GitHub profile and repositories and scores their
proficiency as beginner, intermediate, or advanced based on their self-described
experience, public repo count, follower count, and the variety of technologies detected.
Data files (place in data/):
- applicants_to_evaluate.json List of applicants (discord_id, project_url, proficiency_self, project_reason)
Outputs (written to data/):
- proficiency_evaluations.json Proficiency scores and tech stacks per applicant
Env vars:
- None (uses public GitHub API; may be rate-limited without authentication)
"""
import json
import re
import time
import urllib.error
import urllib.request
from pathlib import Path
# Directory holding the input and output JSON files: the "data" folder under
# the third ancestor directory of this file.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
# Base URL of the GitHub REST API. Requests are sent unauthenticated
# (no auth needed for public repos, but rate limited).
GITHUB_API = "https://api.github.com"
def extract_github_info(url: str) -> tuple[str | None, str | None]:
"""Extract owner and repo from GitHub URL."""
# Handle various GitHub URL formats
patterns = [
r"github\.com/([^/]+)/([^/\s?#]+)", # github.com/owner/repo
r"github\.com/([^/\s?#]+)/?$", # github.com/owner (profile)
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
groups = match.groups()
if len(groups) == 2:
return groups[0], groups[1].rstrip(".git")
elif len(groups) == 1:
return groups[0], None
return None, None
def fetch_github_user(username: str) -> dict | None:
    """Fetch a GitHub user's public profile from the REST API.

    Args:
        username: GitHub login to look up.

    Returns:
        The decoded JSON profile as a dict, or ``None`` when the request or
        decoding fails for any reason or the payload is not a JSON object.
        The lookup is deliberately best-effort and never raises for
        network/HTTP errors.
    """
    url = f"{GITHUB_API}/users/{username}"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        return None
    # Callers use dict methods on the result, so reject any other JSON shape.
    return payload if isinstance(payload, dict) else None
def fetch_github_repos(username: str) -> list:
    """Fetch a user's public repositories from the GitHub REST API.

    Requests a single page (``per_page=100&sort=updated``).

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        The decoded JSON list of repository objects, or an empty list when
        the request or decoding fails for any reason or the payload is not a
        JSON array. The lookup is deliberately best-effort.
    """
    url = f"{GITHUB_API}/users/{username}/repos?per_page=100&sort=updated"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        return []
    # Callers slice and iterate the result, so reject any other JSON shape.
    return payload if isinstance(payload, list) else []
def fetch_repo_languages(owner: str, repo: str) -> dict:
    """Fetch the language breakdown of one repository from the GitHub REST API.

    Args:
        owner: GitHub login that owns the repository.
        repo: Repository name.

    Returns:
        The decoded JSON object keyed by language name, or an empty dict when
        the request or decoding fails for any reason or the payload is not a
        JSON object. The lookup is deliberately best-effort.
    """
    url = f"{GITHUB_API}/repos/{owner}/{repo}/languages"
    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github.v3+json")
    req.add_header("User-Agent", "Cohort-Evaluator")
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(req, timeout=10) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        return {}
    # Callers iterate the keys as language names, so reject other JSON shapes.
    return payload if isinstance(payload, dict) else {}
def analyze_proficiency_text(text: str) -> tuple[str, list[str]]:
    """Infer a proficiency level and mentioned technologies from free text.

    Args:
        text: Applicant-written description; matching is case-insensitive.

    Returns:
        ``(level, technologies)`` where ``level`` is ``"beginner"``,
        ``"intermediate"`` or ``"advanced"`` and ``technologies`` is the
        sorted list of distinct lowercase technology names found in the text.
        Advanced keywords take precedence over beginner keywords, and
        ``"intermediate"`` is the default when nothing matches.
    """
    text_lower = text.lower()

    # Extract languages/technologies mentioned
    tech_patterns = [
        # "c++" and "c#" end in a non-word character, so a plain trailing \b
        # would never match before a space or end of text; accept either a
        # word boundary or a match that ended on '+' / '#'.
        r"\b(python|java|javascript|typescript|c\+\+|c#|ruby|go|rust|swift|kotlin|php|perl|scala|r)(?:\b|(?<=[+#]))",
        r"\b(react|angular|vue|node|express|django|flask|spring|rails|laravel)\b",
        r"\b(html|css|sass|scss|tailwind|bootstrap)\b",
        r"\b(sql|mysql|postgresql|mongodb|redis|firebase)\b",
        r"\b(docker|kubernetes|aws|azure|gcp|git)\b",
        r"\b(machine learning|ml|ai|data science|tensorflow|pytorch)\b",
    ]
    technologies = set()
    for pattern in tech_patterns:
        technologies.update(re.findall(pattern, text_lower))

    # Determine level from keywords
    beginner_keywords = [
        "beginner",
        "learning",
        "new to",
        "just started",
        "basic",
        "novice",
        "early",
    ]
    intermediate_keywords = [
        "intermediate",
        "comfortable",
        "familiar",
        "some experience",
        "worked with",
    ]
    advanced_keywords = [
        "advanced",
        "expert",
        "senior",
        "professional",
        "years of experience",
        "proficient",
        "strong",
    ]

    def mentions_any(keywords: list[str]) -> bool:
        # Anchor each keyword at a word start so "early" does not fire inside
        # "clearly" or "familiar" inside "unfamiliar"; suffixes such as
        # "expertise" or "basically" still count.
        return any(
            re.search(r"\b" + re.escape(keyword), text_lower)
            for keyword in keywords
        )

    # NOTE(review): "machine learning" still triggers the beginner keyword
    # "learning" when no advanced keyword is present - confirm this is intended.
    level = "intermediate"  # default
    if mentions_any(advanced_keywords):
        level = "advanced"
    elif mentions_any(beginner_keywords):
        level = "beginner"
    elif mentions_any(intermediate_keywords):
        level = "intermediate"

    # Sorted so the output does not depend on set iteration order.
    return level, sorted(technologies)
def _tier_points(value: int, high: int, low: int) -> int:
    """Return 2 points when value >= high, 1 when value >= low, otherwise 0."""
    if value >= high:
        return 2
    if value >= low:
        return 1
    return 0


def evaluate_applicant(applicant: dict, index: int, total: int) -> dict:
    """Evaluate a single applicant's technical proficiency.

    Combines the level inferred from the applicant's own text with signals
    from their GitHub profile (public repo count, followers) and the number
    of distinct technologies found, then maps the combined score to a level.

    Args:
        applicant: Record with a required ``discord_id`` and optional
            ``project_url``, ``proficiency_self`` and ``project_reason``
            (missing or null optional fields are treated as empty strings).
        index: Zero-based position of this applicant, used for progress output.
        total: Total number of applicants, used for progress output.

    Returns:
        A dict with the applicant's ``discord_id``, GitHub statistics,
        detected languages, ``self_described_level``, ``final_proficiency``
        (``"beginner"``, ``"intermediate"`` or ``"advanced"``), the merged
        ``tech_stack`` and human-readable ``notes``.

    Raises:
        KeyError: If ``applicant`` has no ``discord_id``.
    """
    discord_id = applicant["discord_id"]
    # Optional fields may be absent or null in the input JSON.
    project_url = applicant.get("project_url") or ""
    proficiency_self = applicant.get("proficiency_self") or ""
    project_reason = applicant.get("project_reason") or ""

    print(f"[{index + 1}/{total}] Evaluating {discord_id}...")

    result = {
        "discord_id": discord_id,
        "github_username": None,
        "github_repos_count": 0,
        "github_followers": 0,
        "languages_from_github": [],
        "languages_from_text": [],
        "self_described_level": None,
        "final_proficiency": "intermediate",  # default
        "tech_stack": [],
        "notes": [],
    }

    # Analyze self-description
    text_level, text_techs = analyze_proficiency_text(
        proficiency_self + " " + project_reason
    )
    result["self_described_level"] = text_level
    result["languages_from_text"] = text_techs

    # Fetch GitHub data if URL provided
    has_github_url = "github.com" in project_url
    profile_fetched = False
    if has_github_url:
        owner, repo = extract_github_info(project_url)
        if owner:
            result["github_username"] = owner

            # Fetch user profile
            user_data = fetch_github_user(owner)
            if user_data:
                profile_fetched = True
                result["github_repos_count"] = user_data.get("public_repos", 0) or 0
                result["github_followers"] = user_data.get("followers", 0) or 0

            # Primary language of the first 10 repos returned by the API
            repos = fetch_github_repos(owner)
            all_languages = {
                r["language"].lower() for r in repos[:10] if r.get("language")
            }

            # If specific repo provided, add its full language breakdown
            if repo:
                all_languages.update(
                    lang.lower() for lang in fetch_repo_languages(owner, repo)
                )

            # Sorted so the output does not depend on set iteration order.
            result["languages_from_github"] = sorted(all_languages)
            time.sleep(0.5)  # Rate limiting

    # Combine tech stack
    all_tech = set(result["languages_from_github"]) | set(result["languages_from_text"])
    result["tech_stack"] = sorted(all_tech)

    # Determine final proficiency
    # Factors: self-description, GitHub activity, tech diversity
    github_score = (
        _tier_points(result["github_repos_count"], 20, 10)
        + _tier_points(result["github_followers"], 50, 10)
        + _tier_points(len(result["tech_stack"]), 6, 3)
    )

    # Map self-described level to score
    level_scores = {"beginner": 0, "intermediate": 2, "advanced": 4}
    self_score = level_scores.get(text_level, 2)

    # Combined score
    total_score = github_score + self_score
    if total_score >= 7:
        result["final_proficiency"] = "advanced"
    elif total_score >= 3:
        result["final_proficiency"] = "intermediate"
    else:
        result["final_proficiency"] = "beginner"

    # Add notes
    if not has_github_url:
        result["notes"].append("No GitHub URL provided")
    if result["github_username"]:
        if not profile_fetched:
            # A failed lookup (e.g. rate limiting) must not be reported as an
            # empty profile.
            result["notes"].append("Could not fetch GitHub profile")
        elif result["github_repos_count"] == 0:
            result["notes"].append("GitHub profile has no public repos")

    return result
def main():
    """Evaluate every applicant in the input file and write the results.

    Reads ``applicants_to_evaluate.json`` from ``DATA_DIR``, evaluates each
    applicant in order, writes all results to ``proficiency_evaluations.json``
    in the same directory, and prints a per-level summary.
    """
    # Load applicants (JSON is UTF-8; do not rely on the platform default encoding)
    with open(DATA_DIR / "applicants_to_evaluate.json", encoding="utf-8") as f:
        applicants = json.load(f)

    total = len(applicants)
    print(f"Evaluating {total} applicants...\n")

    evaluations = []
    for i, applicant in enumerate(applicants):
        evaluations.append(evaluate_applicant(applicant, i, total))

        # Progress update every 10
        if (i + 1) % 10 == 0:
            print(f" Progress: {i + 1}/{total} complete")

    # Save results
    with open(DATA_DIR / "proficiency_evaluations.json", "w", encoding="utf-8") as f:
        json.dump(evaluations, f, indent=2)

    # Summary: count evaluations per final proficiency level
    counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
    for evaluation in evaluations:
        level = evaluation["final_proficiency"]
        if level in counts:
            counts[level] += 1

    print("\n=== EVALUATION COMPLETE ===")
    print(f"Beginner: {counts['beginner']}")
    print(f"Intermediate: {counts['intermediate']}")
    print(f"Advanced: {counts['advanced']}")
    print(f"Total: {len(evaluations)}")
    print("\nResults saved to proficiency_evaluations.json")
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()