generated from nhcarrigan/template
ec58c9c843
CI / dependency-pin-check-typescript (push) Successful in 5s
CI / dependency-pin-check-python (push) Successful in 4s
CI / python (push) Successful in 9m28s
CI / typescript (push) Successful in 9m42s
Security Scan and Upload / Security & DefectDojo Upload (push) Successful in 1m39s
## Summary This PR completes the bash script restructuring and adds comprehensive documentation across all script categories. ### Bash Restructuring - Moved cohort shell scripts (`remove_github_members.sh`, `update_github_teams.sh`) from `python/cohort/` into a new `bash/cohort/` directory - Moved existing bash utilities (`add-keys-to-git.sh`, `fix-yubikey-perms.sh`, `list-yubikey-ssh-keys.sh`) into a new `bash/yubikey/` subdirectory - Updated `run.sh` to support **Bash** as a third language option alongside TypeScript and Python - Bash scripts are run directly (no 1Password secret injection needed) - Category discovery and script listing works the same as for TS/Python - Removed dead "Root Scripts" logic that was no longer needed ### Documentation Added `README.md` files for all script categories that were missing them: - `bash/cohort/README.md` — cohort GitHub team management scripts - `bash/yubikey/README.md` — YubiKey SSH key and permission utilities - `typescript/src/crowdin/README.md` — Crowdin translation management scripts - `typescript/src/discord/README.md` — Discord bot utility scripts - `typescript/src/discourse/README.md` — Discourse forum management scripts - `typescript/src/gitea/README.md` — Gitea bulk repository operation scripts - `typescript/src/github/README.md` — GitHub API interaction scripts - `typescript/src/music/README.md` — Music file metadata tools - `typescript/src/s3/README.md` — S3-compatible object storage scripts - `typescript/src/security/README.md` — Security analysis and reporting scripts - `python/cohort/README.md` — Updated to remove moved shell scripts, fix usage commands Also updated project-level docs: - **`README.md`** — Corrected project structure, fixed running instructions (removed references to non-existent `make run-ts`/`make run-py` targets), added Bash prerequisites - **`CLAUDE.md`** — Updated project overview, structure, development standards, and script-adding guides to reflect the current state of the project ✨ 
This PR was created with help from Hikari~ 🌸 Co-authored-by: Naomi Carrigan <commits@nhcarrigan.com> Reviewed-on: #6 Co-authored-by: Hikari <hikari@nhcarrigan.com> Co-committed-by: Hikari <hikari@nhcarrigan.com>
291 lines
9.2 KiB
Python
291 lines
9.2 KiB
Python
"""Evaluate the technical proficiency of cohort applicants using their GitHub profiles.
|
|
|
|
Fetches each applicant's public GitHub repositories and scores their proficiency as
|
|
Beginner, Intermediate, or Advanced based on language variety, repo count, commit
|
|
activity, and presence of certain technologies.
|
|
|
|
Data files (place in data/):
|
|
- applicants_to_evaluate.json List of applicants with GitHub usernames
|
|
|
|
Outputs (written to data/):
|
|
- proficiency_evaluations.json Proficiency scores and tech stacks per applicant
|
|
|
|
Env vars:
|
|
- None (uses public GitHub API; may be rate-limited without authentication)
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
# The data/ directory sits two levels above the directory containing this file.
_THIS_FILE = Path(__file__)
DATA_DIR = _THIS_FILE.parent.parent.parent / "data"

# Base URL of the GitHub REST API. Public data needs no authentication, but
# unauthenticated requests are rate limited.
GITHUB_API = "https://api.github.com"
|
|
|
|
|
|
def extract_github_info(url: str) -> tuple[str | None, str | None]:
|
|
"""Extract owner and repo from GitHub URL."""
|
|
# Handle various GitHub URL formats
|
|
patterns = [
|
|
r"github\.com/([^/]+)/([^/\s?#]+)", # github.com/owner/repo
|
|
r"github\.com/([^/\s?#]+)/?$", # github.com/owner (profile)
|
|
]
|
|
|
|
for pattern in patterns:
|
|
match = re.search(pattern, url)
|
|
if match:
|
|
groups = match.groups()
|
|
if len(groups) == 2:
|
|
return groups[0], groups[1].rstrip(".git")
|
|
elif len(groups) == 1:
|
|
return groups[0], None
|
|
return None, None
|
|
|
|
|
|
def fetch_github_user(username: str) -> dict | None:
    """Fetch a GitHub user's public profile from the REST API.

    Args:
        username: GitHub login to look up.

    Returns:
        The decoded JSON profile object, or ``None`` when the request or the
        decoding fails for any reason, or the payload is not a JSON object.
    """
    request = urllib.request.Request(
        f"{GITHUB_API}/users/{username}",
        headers={
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Cohort-Evaluator",
        },
    )

    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(request, timeout=10) as response:
            profile = json.loads(response.read().decode())
    except Exception:
        # Deliberately best-effort: a failed lookup for one applicant must not
        # abort the whole evaluation run.
        return None

    return profile if isinstance(profile, dict) else None
|
|
|
|
|
|
def fetch_github_repos(username: str) -> list:
    """Fetch a user's public repositories from the GitHub REST API.

    Requests a single page of up to 100 repositories, sorted by update time.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        The decoded JSON list of repository objects, or an empty list when the
        request or the decoding fails for any reason, or the payload is not a
        JSON list.
    """
    request = urllib.request.Request(
        f"{GITHUB_API}/users/{username}/repos?per_page=100&sort=updated",
        headers={
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Cohort-Evaluator",
        },
    )

    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(request, timeout=10) as response:
            repos = json.loads(response.read().decode())
    except Exception:
        # Deliberately best-effort: a failed lookup for one applicant must not
        # abort the whole evaluation run.
        return []

    # Callers slice and iterate the result, so never hand back a non-list.
    return repos if isinstance(repos, list) else []
|
|
|
|
|
|
def fetch_repo_languages(owner: str, repo: str) -> dict:
    """Fetch the languages reported for a repository by the GitHub REST API.

    Args:
        owner: Login of the repository owner.
        repo: Repository name.

    Returns:
        The decoded JSON object keyed by language name, or an empty dict when
        the request or the decoding fails for any reason, or the payload is
        not a JSON object.
    """
    request = urllib.request.Request(
        f"{GITHUB_API}/repos/{owner}/{repo}/languages",
        headers={
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Cohort-Evaluator",
        },
    )

    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(request, timeout=10) as response:
            languages = json.loads(response.read().decode())
    except Exception:
        # Deliberately best-effort: a failed lookup for one applicant must not
        # abort the whole evaluation run.
        return {}

    return languages if isinstance(languages, dict) else {}
|
|
|
|
|
|
def analyze_proficiency_text(text: str) -> tuple[str, list[str]]:
    """Infer a proficiency level and the technologies mentioned in free text.

    Args:
        text: Applicant-written description; matching is case-insensitive.

    Returns:
        ``(level, technologies)`` where ``level`` is ``"beginner"``,
        ``"intermediate"`` or ``"advanced"`` and ``technologies`` is a sorted
        list of the lowercase technology names found in the text.
    """
    text_lower = text.lower()

    # Each pattern has exactly one capturing group so re.findall yields the
    # matched technology names directly.
    tech_patterns = (
        # Names ending in a symbol get no trailing \b: a word boundary only
        # exists after "+" or "#" when a word character follows, so "c++ " and
        # "c#," would otherwise never match.
        r"(?<!\w)(c\+\+|c#)",
        r"\b(python|java|javascript|typescript|ruby|go|rust|swift|kotlin|php|perl|scala|r)\b",
        r"\b(react|angular|vue|node|express|django|flask|spring|rails|laravel)\b",
        r"\b(html|css|sass|scss|tailwind|bootstrap)\b",
        r"\b(sql|mysql|postgresql|mongodb|redis|firebase)\b",
        r"\b(docker|kubernetes|aws|azure|gcp|git)\b",
        r"\b(machine learning|ml|ai|data science|tensorflow|pytorch)\b",
    )

    technologies: set[str] = set()
    for pattern in tech_patterns:
        technologies.update(re.findall(pattern, text_lower))

    # Level keywords are matched as plain substrings of the lowercased text.
    advanced_keywords = (
        "advanced",
        "expert",
        "senior",
        "professional",
        "years of experience",
        "proficient",
        "strong",
    )
    beginner_keywords = (
        "beginner",
        "learning",
        "new to",
        "just started",
        "basic",
        "novice",
        "early",
    )

    # Advanced wording takes precedence over beginner wording. Anything else
    # is treated as intermediate, so intermediate phrases need no explicit
    # check of their own.
    if any(keyword in text_lower for keyword in advanced_keywords):
        level = "advanced"
    elif any(keyword in text_lower for keyword in beginner_keywords):
        level = "beginner"
    else:
        level = "intermediate"

    # Sorted so repeated runs produce identical output.
    return level, sorted(technologies)
|
|
|
|
|
|
def _score_proficiency(
    repos_count: int, followers: int, tech_count: int, self_level: str
) -> str:
    """Combine GitHub activity, tech breadth and self-assessment into a final level."""
    # The self-described level sets the baseline (unknown levels count as
    # intermediate).
    score = {"beginner": 0, "intermediate": 2, "advanced": 4}.get(self_level, 2)

    # Each signal adds 2 points at its high threshold, 1 at its low threshold.
    for value, high, low in (
        (repos_count, 20, 10),
        (followers, 50, 10),
        (tech_count, 6, 3),
    ):
        if value >= high:
            score += 2
        elif value >= low:
            score += 1

    if score >= 7:
        return "advanced"
    if score >= 3:
        return "intermediate"
    return "beginner"


def evaluate_applicant(applicant: dict, index: int, total: int) -> dict:
    """Evaluate a single applicant's technical proficiency.

    Args:
        applicant: Applicant record. ``discord_id`` is required;
            ``project_url``, ``proficiency_self`` and ``project_reason`` are
            read when present and treated as empty when missing or null.
        index: Zero-based position of the applicant, used for the progress line.
        total: Total number of applicants, used for the progress line.

    Returns:
        A dict with the applicant's ``discord_id``, the GitHub data that could
        be fetched, the self-described and final proficiency levels, the
        combined ``tech_stack`` and a list of free-text ``notes``.

    Raises:
        KeyError: If ``applicant`` has no ``discord_id``.
    """
    discord_id = applicant["discord_id"]
    # Tolerate missing or null optional fields instead of failing on them.
    project_url = applicant.get("project_url") or ""
    proficiency_self = applicant.get("proficiency_self") or ""
    project_reason = applicant.get("project_reason") or ""

    print(f"[{index + 1}/{total}] Evaluating {discord_id}...")

    text_level, text_techs = analyze_proficiency_text(
        f"{proficiency_self} {project_reason}"
    )

    result = {
        "discord_id": discord_id,
        "github_username": None,
        "github_repos_count": 0,
        "github_followers": 0,
        "languages_from_github": [],
        "languages_from_text": text_techs,
        "self_described_level": text_level,
        "final_proficiency": "intermediate",  # default
        "tech_stack": [],
        "notes": [],
    }

    has_github_url = "github.com" in project_url
    profile_fetched = False

    if has_github_url:
        owner, repo = extract_github_info(project_url)

        if owner:
            result["github_username"] = owner

            user_data = fetch_github_user(owner)
            if user_data:
                profile_fetched = True
                result["github_repos_count"] = user_data.get("public_repos", 0)
                result["github_followers"] = user_data.get("followers", 0)

            # Primary language of the first 10 repos returned by the API.
            languages = {
                entry["language"].lower()
                for entry in fetch_github_repos(owner)[:10]
                if entry.get("language")
            }

            # When the URL names a specific repo, include all of its languages.
            if repo:
                languages.update(
                    lang.lower() for lang in fetch_repo_languages(owner, repo)
                )

            result["languages_from_github"] = sorted(languages)

            time.sleep(0.5)  # Rate limiting

    result["tech_stack"] = sorted(
        set(result["languages_from_github"]) | set(result["languages_from_text"])
    )

    result["final_proficiency"] = _score_proficiency(
        result["github_repos_count"],
        result["github_followers"],
        len(result["tech_stack"]),
        text_level,
    )

    if not has_github_url:
        result["notes"].append("No GitHub URL provided")
    if result["github_username"]:
        if not profile_fetched:
            # A failed lookup (e.g. rate limiting) is not the same as an empty
            # profile, so it gets its own note.
            result["notes"].append("GitHub profile could not be fetched")
        elif result["github_repos_count"] == 0:
            result["notes"].append("GitHub profile has no public repos")

    return result
|
|
|
|
|
|
def main() -> None:
    """Evaluate every applicant in the input file and save the results.

    Reads ``applicants_to_evaluate.json`` from ``DATA_DIR``, evaluates each
    applicant in order, writes the evaluations to
    ``proficiency_evaluations.json`` in the same directory and prints a
    per-level summary.
    """
    # Explicit UTF-8 so reading does not depend on the platform's default
    # encoding.
    with open(DATA_DIR / "applicants_to_evaluate.json", encoding="utf-8") as f:
        applicants = json.load(f)

    total = len(applicants)
    print(f"Evaluating {total} applicants...\n")

    evaluations = []
    for i, applicant in enumerate(applicants):
        evaluations.append(evaluate_applicant(applicant, i, total))

        # Progress update every 10
        if (i + 1) % 10 == 0:
            print(f" Progress: {i + 1}/{total} complete")

    with open(
        DATA_DIR / "proficiency_evaluations.json", "w", encoding="utf-8"
    ) as f:
        json.dump(evaluations, f, indent=2)

    # Tally the final levels in a single pass.
    counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
    for evaluation in evaluations:
        level = evaluation["final_proficiency"]
        if level in counts:
            counts[level] += 1

    print("\n=== EVALUATION COMPLETE ===")
    print(f"Beginner: {counts['beginner']}")
    print(f"Intermediate: {counts['intermediate']}")
    print(f"Advanced: {counts['advanced']}")
    print(f"Total: {len(evaluations)}")
    print("\nResults saved to proficiency_evaluations.json")
|
|
|
|
|
|
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|