import http.client
import json
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from collections import Counter
from typing import Optional

# GitHub API (no auth needed for public repos, but rate limited)
GITHUB_API = "https://api.github.com"

# Matches "github.com/<owner>" with an optional "/<repo>" segment. Path
# segments stop at '/', whitespace, '?' or '#', so query strings and
# fragments never leak into the owner or repo name.
_GITHUB_URL_RE = re.compile(r'github\.com/([^/\s?#]+)(?:/([^/\s?#]+))?')

# Technology names searched for in free-form applicant text.
_TECH_TERMS = (
    'python', 'java', 'javascript', 'typescript', 'c++', 'c#', 'ruby', 'go',
    'rust', 'swift', 'kotlin', 'php', 'perl', 'scala', 'r',
    'react', 'angular', 'vue', 'node', 'express', 'django', 'flask',
    'spring', 'rails', 'laravel',
    'html', 'css', 'sass', 'scss', 'tailwind', 'bootstrap',
    'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'firebase',
    'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'git',
    'machine learning', 'ml', 'ai', 'data science', 'tensorflow', 'pytorch',
)

# Lookarounds are used instead of \b because \b never matches after a
# trailing '+' or '#' that is followed by a non-word character, which made
# "c++" and "c#" undetectable. For purely alphanumeric terms the lookarounds
# behave exactly like \b.
_TECH_RE = re.compile(
    r'(?<!\w)(?:' + '|'.join(re.escape(term) for term in _TECH_TERMS) + r')(?!\w)'
)

_BEGINNER_KEYWORDS = (
    'beginner', 'learning', 'new to', 'just started', 'basic', 'novice',
    'early',
)
_INTERMEDIATE_KEYWORDS = (
    'intermediate', 'comfortable', 'familiar', 'some experience',
    'worked with',
)
_ADVANCED_KEYWORDS = (
    'advanced', 'expert', 'senior', 'professional', 'years of experience',
    'proficient', 'strong',
)

# Points contributed by the self-described level to the combined score.
_LEVEL_SCORES = {'beginner': 0, 'intermediate': 2, 'advanced': 4}


def extract_github_info(url: str) -> tuple[Optional[str], Optional[str]]:
    """Extract owner and repo from a GitHub URL.

    Returns ``(owner, repo)``. ``repo`` is ``None`` for profile URLs, and
    both values are ``None`` when the URL holds no ``github.com/<owner>``
    path. A trailing ``.git`` suffix on the repo name is removed.
    """
    match = _GITHUB_URL_RE.search(url)
    if match is None:
        return None, None
    owner, repo = match.groups()
    if repo is not None:
        # removesuffix, not rstrip: rstrip('.git') strips any trailing run of
        # the characters '.', 'g', 'i', 't' and would turn "digit" into "d".
        repo = repo.removesuffix('.git') or None
    return owner, repo


def _github_get_json(path: str):
    """GET ``GITHUB_API + path`` and return the decoded JSON, or None on failure.

    Failures are swallowed on purpose: the evaluation is best-effort and one
    unreachable profile must not abort the whole batch.
    """
    try:
        req = urllib.request.Request(f"{GITHUB_API}{path}")
        req.add_header("Accept", "application/vnd.github.v3+json")
        req.add_header("User-Agent", "Cohort-Evaluator")
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=10) as response:
            return json.loads(response.read().decode())
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers urllib.error.URLError/HTTPError and socket timeouts;
        # ValueError covers JSON and Unicode decoding errors.
        return None


def fetch_github_user(username: str) -> Optional[dict]:
    """Fetch GitHub user profile; return None if unavailable."""
    data = _github_get_json(f"/users/{urllib.parse.quote(username, safe='')}")
    return data if isinstance(data, dict) else None


def fetch_github_repos(username: str) -> list:
    """Fetch user's public repos (up to 100, most recently updated first).

    Returns an empty list when the request fails or the payload is not a list.
    """
    user = urllib.parse.quote(username, safe='')
    data = _github_get_json(f"/users/{user}/repos?per_page=100&sort=updated")
    return data if isinstance(data, list) else []


def fetch_repo_languages(owner: str, repo: str) -> dict:
    """Fetch languages used in a repo; return {} if unavailable."""
    quoted_owner = urllib.parse.quote(owner, safe='')
    quoted_repo = urllib.parse.quote(repo, safe='')
    data = _github_get_json(f"/repos/{quoted_owner}/{quoted_repo}/languages")
    return data if isinstance(data, dict) else {}


def analyze_proficiency_text(text: str) -> tuple[str, list[str]]:
    """Analyze self-described proficiency text.

    Returns ``(level, technologies)`` where ``level`` is one of
    ``'beginner'``, ``'intermediate'`` or ``'advanced'`` and ``technologies``
    is a sorted list of the lower-cased technology names found in ``text``.
    """
    text_lower = text.lower()

    # Extract languages/technologies mentioned
    technologies = set(_TECH_RE.findall(text_lower))

    # Determine level from keywords. Advanced keywords win over beginner
    # keywords, and anything else falls back to intermediate.
    if any(kw in text_lower for kw in _ADVANCED_KEYWORDS):
        level = 'advanced'
    elif any(kw in text_lower for kw in _BEGINNER_KEYWORDS):
        level = 'beginner'
    else:
        level = 'intermediate'

    return level, sorted(technologies)


def _activity_score(repos_count: int, followers: int, tech_count: int) -> int:
    """Score GitHub activity and tech diversity (0-6 points)."""
    score = 0
    if repos_count >= 20:
        score += 2
    elif repos_count >= 10:
        score += 1

    if followers >= 50:
        score += 2
    elif followers >= 10:
        score += 1

    if tech_count >= 6:
        score += 2
    elif tech_count >= 3:
        score += 1
    return score


def evaluate_applicant(applicant: dict, index: int, total: int) -> dict:
    """Evaluate a single applicant's technical proficiency.

    ``applicant`` must contain ``discord_id``. ``project_url``,
    ``proficiency_self`` and ``project_reason`` may be missing or ``None``
    and are then treated as empty strings. ``index`` and ``total`` are used
    only for the progress line that is printed.

    Returns a dict with the GitHub data gathered, the technologies found, the
    self-described level, the final proficiency and any notes.
    """
    discord_id = applicant['discord_id']
    # `or ''` also covers JSON nulls, which would otherwise break the string
    # concatenation and the substring checks below.
    project_url = applicant.get('project_url') or ''
    proficiency_self = applicant.get('proficiency_self') or ''
    project_reason = applicant.get('project_reason') or ''

    print(f"[{index+1}/{total}] Evaluating {discord_id}...")

    result = {
        'discord_id': discord_id,
        'github_username': None,
        'github_repos_count': 0,
        'github_followers': 0,
        'languages_from_github': [],
        'languages_from_text': [],
        'self_described_level': None,
        'final_proficiency': 'intermediate',  # default
        'tech_stack': [],
        'notes': []
    }

    # Analyze self-description
    text_level, text_techs = analyze_proficiency_text(
        proficiency_self + " " + project_reason
    )
    result['self_described_level'] = text_level
    result['languages_from_text'] = text_techs

    # Fetch GitHub data if URL provided
    has_github_url = 'github.com' in project_url
    profile_fetched = False
    if has_github_url:
        owner, repo = extract_github_info(project_url)
        if owner:
            result['github_username'] = owner

            # Fetch user profile
            user_data = fetch_github_user(owner)
            if user_data:
                profile_fetched = True
                result['github_repos_count'] = user_data.get('public_repos') or 0
                result['github_followers'] = user_data.get('followers') or 0

            # Primary language of the 10 most recently updated repos
            languages = {
                entry['language'].lower()
                for entry in fetch_github_repos(owner)[:10]
                if isinstance(entry, dict) and entry.get('language')
            }

            # If specific repo provided, get its languages
            if repo:
                languages.update(
                    lang.lower() for lang in fetch_repo_languages(owner, repo)
                )

            result['languages_from_github'] = sorted(languages)

            time.sleep(0.5)  # Rate limiting

    # Combine tech stack
    result['tech_stack'] = sorted(
        set(result['languages_from_github']) | set(result['languages_from_text'])
    )

    # Determine final proficiency
    # Factors: self-description, GitHub activity, tech diversity
    total_score = _activity_score(
        result['github_repos_count'],
        result['github_followers'],
        len(result['tech_stack']),
    ) + _LEVEL_SCORES.get(text_level, 2)

    if total_score >= 7:
        result['final_proficiency'] = 'advanced'
    elif total_score >= 3:
        result['final_proficiency'] = 'intermediate'
    else:
        result['final_proficiency'] = 'beginner'

    # Add notes
    if not has_github_url:
        result['notes'].append('No GitHub URL provided')
    if result['github_username']:
        if not profile_fetched:
            # A failed lookup must not be reported as an empty profile.
            result['notes'].append('GitHub profile could not be fetched')
        elif result['github_repos_count'] == 0:
            result['notes'].append('GitHub profile has no public repos')

    return result


def main():
    """Evaluate every applicant in applicants_to_evaluate.json.

    Writes the results to proficiency_evaluations.json and prints a summary
    of how many applicants fall into each proficiency level.
    """
    # Load applicants
    with open('applicants_to_evaluate.json', 'r', encoding='utf-8') as f:
        applicants = json.load(f)

    applicant_count = len(applicants)
    print(f"Evaluating {applicant_count} applicants...\n")

    evaluations = []
    for i, applicant in enumerate(applicants):
        evaluations.append(evaluate_applicant(applicant, i, applicant_count))

        # Progress update every 10
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i+1}/{applicant_count} complete")

    # Save results
    with open('proficiency_evaluations.json', 'w', encoding='utf-8') as f:
        json.dump(evaluations, f, indent=2)

    # Summary
    levels = Counter(e['final_proficiency'] for e in evaluations)

    print("\n=== EVALUATION COMPLETE ===")
    print(f"Beginner: {levels['beginner']}")
    print(f"Intermediate: {levels['intermediate']}")
    print(f"Advanced: {levels['advanced']}")
    print(f"Total: {len(evaluations)}")
    print("\nResults saved to proficiency_evaluations.json")


if __name__ == "__main__":
    main()