feat: add multilingual support so Naomi can use Python too

2026-01-23 15:32:02 -08:00
parent 38e7f15d93
commit c0ad74367a
52 changed files with 1305 additions and 46 deletions
@@ -0,0 +1,182 @@
+import json
+import re
+from collections import defaultdict
+
+# Day names used as column keys for per-day availability, Monday first.
+DAYS = [
+    'Monday',
+    'Tuesday',
+    'Wednesday',
+    'Thursday',
+    'Friday',
+    'Saturday',
+    'Sunday',
+]
+
+# Named six-hour windows of the UTC day, stored as (start_hour, end_hour).
+UTC_BLOCKS = dict(
+    mornings=(6, 12),     # 06:00 - 12:00 UTC
+    afternoons=(12, 18),  # 12:00 - 18:00 UTC
+    evenings=(18, 24),    # 18:00 - 00:00 UTC
+    nights=(0, 6),        # 00:00 - 06:00 UTC
+)
+
+def parse_utc_offset(timezone_str: str) -> float:
+    """Extract the UTC offset, in hours, from a timezone label.
+
+    Accepts labels such as 'America/New_York (UTC-5)' or
+    'Asia/Kolkata (UTC+5:30)'; the sign and the ':MM' part are optional.
+
+    Args:
+        timezone_str: Text containing a 'UTC<offset>' marker.
+
+    Returns:
+        The signed offset in fractional hours, or 0.0 when no
+        'UTC<digits>' marker is found.
+    """
+    match = re.search(r'UTC([+-]?)(\d+)(?::(\d+))?', timezone_str)
+    if match is None:
+        return 0.0
+
+    sign_text, hours_text, minutes_text = match.groups()
+    magnitude = int(hours_text) + (int(minutes_text) / 60 if minutes_text else 0)
+    # Take the sign from the text itself: int('-0') is 0, so deriving the
+    # sign from the parsed hour value would turn 'UTC-0:30' into +0.5.
+    return float(-magnitude if sign_text == '-' else magnitude)
+
+def parse_time_slots(time_str: str) -> list[tuple[int, int]]:
+    """Parse availability ranges such as '17:00-18:00' into hour pairs.
+
+    Every 'HH:MM-HH:MM' range in the text is returned, whatever separates
+    the ranges (';', ',', line breaks, ...). Hyphens, en dashes and em
+    dashes are accepted between the two times.
+
+    Args:
+        time_str: Cell text, e.g. '07:00-08:00; 19:00-20:00'. Empty text
+            and 'n/a' / 'na' (any case) mean "no availability".
+
+    Returns:
+        A list of (start_hour, end_hour) tuples in order of appearance.
+        Minutes are parsed but discarded, so '17:30-18:45' yields (17, 18).
+    """
+    if not time_str or time_str.strip().lower() in ('n/a', 'na'):
+        return []
+
+    # Scan the whole cell rather than taking one match per ';' segment, so
+    # ranges separated by commas or newlines are not silently dropped.
+    range_pattern = r'(\d{1,2}):(\d{2})\s*[-\u2013\u2014]\s*(\d{1,2}):(\d{2})'
+    return [
+        (int(found.group(1)), int(found.group(3)))
+        for found in re.finditer(range_pattern, time_str)
+    ]
+
+def local_hour_to_utc(local_hour: int, utc_offset: float) -> int:
+    """Convert a local clock hour to the UTC hour (0-23) it starts in.
+
+    Args:
+        local_hour: Hour on the local clock; values outside 0-23 wrap.
+        utc_offset: Local offset from UTC in hours (may be fractional).
+
+    Returns:
+        The UTC hour as an int in the range 0-23.
+    """
+    # Floor before wrapping instead of truncating toward zero: local 00:00
+    # at UTC+5:30 is 18:30 UTC, i.e. hour 18, whereas int(-5.5) is -5 and
+    # would wrap to 19.
+    return int((local_hour - utc_offset) // 1) % 24
+
+def get_utc_blocks_for_hour(utc_hour: int) -> list[str]:
+    """Return the names of the UTC blocks whose window contains the hour.
+
+    A block with window (start, end) in UTC_BLOCKS contains the hour when
+    start <= utc_hour < end. Names are returned in UTC_BLOCKS order; the
+    list is empty when no window contains the hour.
+    """
+    return [
+        name
+        for name, (lower, upper) in UTC_BLOCKS.items()
+        if lower <= utc_hour < upper
+    ]
+
+def analyze_applicant_availability(timezone_str: str, day_slots: dict) -> dict:
+    """Summarise one applicant's weekly availability in terms of UTC blocks.
+
+    Args:
+        timezone_str: Timezone label passed to parse_utc_offset.
+        day_slots: Maps a day name from DAYS to a list of
+            (start_hour, end_hour) local-time slots; missing days count as
+            no availability.
+
+    Returns:
+        A dict with keys 'utc_offset', 'timezone', 'available_blocks'
+        (blocks with at least 3 available hours summed over the week),
+        'block_counts' (hours per block) and 'total_unique_utc_hours'
+        (distinct UTC hours of the day covered).
+    """
+    utc_offset = parse_utc_offset(timezone_str)
+
+    block_counts = defaultdict(int)
+    all_utc_hours = set()
+
+    for day in DAYS:
+        for start_hour, end_hour in day_slots.get(day, []):
+            # A slot that runs past midnight (e.g. 22:00-02:00 -> (22, 2))
+            # has end < start; extend it into the next day so range() is
+            # not empty and the slot is not silently dropped.
+            if end_hour < start_hour:
+                end_hour += 24
+            for hour in range(start_hour, end_hour):
+                utc_hour = local_hour_to_utc(hour, utc_offset)
+                all_utc_hours.add(utc_hour)
+                for block in get_utc_blocks_for_hour(utc_hour):
+                    block_counts[block] += 1
+
+    # Looking each name up on the defaultdict also seeds missing blocks
+    # with 0, so 'block_counts' in the result always has all four blocks.
+    available_blocks = [
+        block
+        for block in ['mornings', 'afternoons', 'evenings', 'nights']
+        if block_counts[block] >= 3
+    ]
+
+    return {
+        'utc_offset': utc_offset,
+        'timezone': timezone_str,
+        'available_blocks': available_blocks,
+        'block_counts': dict(block_counts),
+        'total_unique_utc_hours': len(all_utc_hours)
+    }
+
+def parse_table_md() -> list[dict]:
+    """Read 'table.md' and return its applicant rows as dicts.
+
+    The table is located by the first line starting with '| Discord ID';
+    its cells become the dict keys for every following row.
+
+    Returns:
+        One dict per table row, mapping header text to cell text (both
+        stripped of surrounding whitespace).
+
+    Raises:
+        ValueError: If no header line starting with '| Discord ID' exists.
+        OSError: If 'table.md' cannot be opened.
+    """
+    # Decode explicitly as UTF-8 (assumed encoding of table.md) instead of
+    # the platform default, which can fail on non-ASCII names.
+    with open('table.md', 'r', encoding='utf-8') as f:
+        lines = f.read().strip().split('\n')
+
+    header_idx = next(
+        (i for i, line in enumerate(lines) if line.startswith('| Discord ID')),
+        None,
+    )
+    if header_idx is None:
+        raise ValueError("Could not find table header")
+
+    headers = [h.strip() for h in lines[header_idx].split('|')[1:-1]]
+
+    applicants = []
+    # Start two lines below the header; the line in between is presumably
+    # the markdown '|---|' separator row.
+    for line in lines[header_idx + 2:]:
+        if not line.startswith('|'):
+            continue
+
+        cells = [c.strip() for c in line.split('|')[1:-1]]
+        # Rows with fewer cells than headers are skipped as malformed.
+        if len(cells) < len(headers):
+            continue
+
+        applicants.append(dict(zip(headers, cells)))
+
+    return applicants
+
+def main():
+    """Analyse availability of verified applicants and write a JSON report.
+
+    Reads 'discord_verification.json' (the first element of each entry
+    under 'verified' is used as the Discord ID) and the rows of table.md,
+    analyses every row whose 'Discord ID' is verified, writes the results
+    to 'availability_analysis.json' and prints a per-block summary.
+    """
+    with open('discord_verification.json', 'r') as f:
+        verification = json.load(f)
+
+    verified_ids = {entry[0] for entry in verification['verified']}
+    print(f"Verified applicants: {len(verified_ids)}")
+
+    applicants = parse_table_md()
+    print(f"Total applicants in table: {len(applicants)}")
+
+    availability_results = []
+    for applicant in applicants:
+        discord_id = applicant.get('Discord ID', '')
+        if discord_id in verified_ids:
+            timezone = applicant.get('Timezone', '')
+            day_slots = {
+                day: parse_time_slots(applicant.get(day, ''))
+                for day in DAYS
+            }
+            analysis = analyze_applicant_availability(timezone, day_slots)
+
+            # Same key order as the report has always used.
+            record = {'discord_id': discord_id, 'timezone': timezone}
+            for key in ('utc_offset', 'available_blocks', 'block_counts',
+                        'total_unique_utc_hours'):
+                record[key] = analysis[key]
+            availability_results.append(record)
+
+    with open('availability_analysis.json', 'w') as f:
+        json.dump(availability_results, f, indent=2)
+
+    print("\n=== AVAILABILITY ANALYSIS COMPLETE ===")
+    print(f"Analyzed: {len(availability_results)} applicants")
+    print("\nBlock Distribution (applicants available in each block):")
+    for block in ['mornings', 'afternoons', 'evenings', 'nights']:
+        applicants_in_block = sum(
+            result['available_blocks'].count(block)
+            for result in availability_results
+        )
+        print(f"  {block.capitalize()}: {applicants_in_block}")
+
+    without_blocks = [r for r in availability_results if not r['available_blocks']]
+    print(f"\nApplicants with no clear block availability: {len(without_blocks)}")
+
+    print("\nResults saved to availability_analysis.json")
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,238 @@
+import json
+import re
+import time
+import urllib.request
+import urllib.error
+from typing import Optional
+
+# Base URL of the GitHub REST API. Requests in this script carry no
+# credentials: that works for public data, but unauthenticated calls are
+# rate limited by GitHub.
+GITHUB_API = "https://api.github.com"
+
+def extract_github_info(url: str) -> tuple[Optional[str], Optional[str]]:
+    """Extract the owner and, when present, the repository from a GitHub URL.
+
+    Args:
+        url: Text containing 'github.com/<owner>' or
+            'github.com/<owner>/<repo>[/...]'.
+
+    Returns:
+        (owner, repo) for a repository URL, with a trailing '.git' removed
+        from the repo name; (owner, None) for a profile URL; (None, None)
+        when the text contains no recognisable GitHub path.
+    """
+    # Repository form: github.com/<owner>/<repo>
+    repo_match = re.search(r'github\.com/([^/]+)/([^/\s?#]+)', url)
+    if repo_match:
+        owner, repo = repo_match.groups()
+        # removesuffix drops only a literal '.git'. rstrip('.git') strips
+        # any trailing '.', 'g', 'i' or 't' characters and so mangles
+        # names such as 'widget' or 'digit'.
+        return owner, repo.removesuffix('.git') or None
+
+    # Profile form: github.com/<owner> with an optional trailing slash
+    profile_match = re.search(r'github\.com/([^/\s?#]+)/?$', url)
+    if profile_match:
+        return profile_match.group(1), None
+
+    return None, None
+
+def fetch_github_user(username: str) -> Optional[dict]:
+    """Fetch a GitHub user's public profile.
+
+    Args:
+        username: GitHub login to look up.
+
+    Returns:
+        The decoded JSON object from GET /users/<username>, or None when
+        the request fails for any reason (network error, HTTP error
+        status, timeout, undecodable body) or the body is not a JSON
+        object.
+    """
+    req = urllib.request.Request(
+        f"{GITHUB_API}/users/{username}",
+        headers={
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "Cohort-Evaluator",
+        },
+    )
+
+    try:
+        # The context manager closes the response even when reading or
+        # decoding the body raises.
+        with urllib.request.urlopen(req, timeout=10) as response:
+            payload = json.loads(response.read().decode())
+    except Exception:
+        # Best-effort lookup: every failure is reported to the caller as
+        # "no data" rather than raised.
+        return None
+
+    return payload if isinstance(payload, dict) else None
+
+def fetch_github_repos(username: str) -> list:
+    """Fetch up to 100 of a user's public repos, most recently updated first.
+
+    Args:
+        username: GitHub login whose repositories are listed.
+
+    Returns:
+        The decoded JSON array from GET /users/<username>/repos, or an
+        empty list when the request fails for any reason or the body is
+        not a JSON array.
+    """
+    req = urllib.request.Request(
+        f"{GITHUB_API}/users/{username}/repos?per_page=100&sort=updated",
+        headers={
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "Cohort-Evaluator",
+        },
+    )
+
+    try:
+        # The context manager closes the response even when reading or
+        # decoding the body raises.
+        with urllib.request.urlopen(req, timeout=10) as response:
+            payload = json.loads(response.read().decode())
+    except Exception:
+        # Best-effort lookup: every failure is reported to the caller as
+        # "no repos" rather than raised.
+        return []
+
+    # Callers slice and iterate the result, so never hand back a non-list.
+    return payload if isinstance(payload, list) else []
+
+def fetch_repo_languages(owner: str, repo: str) -> dict:
+    """Fetch the languages GitHub reports for one repository.
+
+    Args:
+        owner: Login of the repository owner.
+        repo: Repository name.
+
+    Returns:
+        The decoded JSON object from GET /repos/<owner>/<repo>/languages
+        (keyed by language name), or an empty dict when the request fails
+        for any reason or the body is not a JSON object.
+    """
+    req = urllib.request.Request(
+        f"{GITHUB_API}/repos/{owner}/{repo}/languages",
+        headers={
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "Cohort-Evaluator",
+        },
+    )
+
+    try:
+        # The context manager closes the response even when reading or
+        # decoding the body raises.
+        with urllib.request.urlopen(req, timeout=10) as response:
+            payload = json.loads(response.read().decode())
+    except Exception:
+        # Best-effort lookup: every failure is reported to the caller as
+        # "no languages" rather than raised.
+        return {}
+
+    return payload if isinstance(payload, dict) else {}
+
+def analyze_proficiency_text(text: str) -> tuple[str, list[str]]:
+    """Classify self-described proficiency and list the technologies named.
+
+    Args:
+        text: Free-form description; None or empty text is allowed.
+
+    Returns:
+        (level, technologies). level is 'advanced' when an advanced
+        keyword occurs, otherwise 'beginner' when a beginner keyword
+        occurs, otherwise 'intermediate'. technologies is the sorted list
+        of distinct lower-cased technology names found in the text.
+    """
+    text_lower = (text or '').lower()
+
+    # Extract languages/technologies mentioned
+    tech_patterns = [
+        # '\b' needs a word character on one side, so it never matches
+        # after the '+' or '#' of "c++ " / "c#,"; use lookarounds for
+        # the symbol-suffixed names instead.
+        r'(?<!\w)(c\+\+|c#)(?![+#])',
+        r'\b(python|java|javascript|typescript|ruby|go|rust|swift|kotlin|php|perl|scala|r)\b',
+        r'\b(react|angular|vue|node|express|django|flask|spring|rails|laravel)\b',
+        r'\b(html|css|sass|scss|tailwind|bootstrap)\b',
+        r'\b(sql|mysql|postgresql|mongodb|redis|firebase)\b',
+        r'\b(docker|kubernetes|aws|azure|gcp|git)\b',
+        r'\b(machine learning|ml|ai|data science|tensorflow|pytorch)\b',
+    ]
+
+    technologies = set()
+    for pattern in tech_patterns:
+        technologies.update(re.findall(pattern, text_lower))
+
+    # Determine level from keywords
+    beginner_keywords = ['beginner', 'learning', 'new to', 'just started', 'basic', 'novice', 'early']
+    advanced_keywords = ['advanced', 'expert', 'senior', 'professional', 'years of experience', 'proficient', 'strong']
+
+    # 'machine learning' names a technology; it must not count as the
+    # beginner keyword 'learning'.
+    level_text = text_lower.replace('machine learning', ' ')
+
+    def mentions(keywords: list[str]) -> bool:
+        # Anchor each keyword at a word start so 'early' does not fire on
+        # 'nearly' or 'clearly', while 'basic' still covers 'basics'.
+        return any(re.search(r'\b' + re.escape(kw), level_text) for kw in keywords)
+
+    if mentions(advanced_keywords):
+        level = 'advanced'
+    elif mentions(beginner_keywords):
+        level = 'beginner'
+    else:
+        # Everything else, including explicit "intermediate" wording,
+        # falls through to the default level.
+        level = 'intermediate'
+
+    # Sorted so the result does not depend on set iteration order.
+    return level, sorted(technologies)
+
+def _tier_points(value: int, high: int, low: int) -> int:
+    """Return 2 when value >= high, 1 when value >= low, otherwise 0."""
+    if value >= high:
+        return 2
+    if value >= low:
+        return 1
+    return 0
+
+def evaluate_applicant(applicant: dict, index: int, total: int) -> dict:
+    """Evaluate a single applicant's technical proficiency.
+
+    Combines the self-described level from the applicant's text with
+    GitHub activity (public repo count, followers) and the number of
+    distinct technologies found, then maps the combined score to
+    'beginner', 'intermediate' or 'advanced'.
+
+    Args:
+        applicant: Record with 'discord_id' (required) and the optional
+            text fields 'project_url', 'proficiency_self' and
+            'project_reason'; missing or null optional fields are
+            treated as empty.
+        index: Zero-based position of the applicant, used for the
+            progress line.
+        total: Total number of applicants, used for the progress line.
+
+    Returns:
+        A dict with keys 'discord_id', 'github_username',
+        'github_repos_count', 'github_followers',
+        'languages_from_github', 'languages_from_text',
+        'self_described_level', 'final_proficiency', 'tech_stack' and
+        'notes'.
+
+    Raises:
+        KeyError: If 'discord_id' is missing from the record.
+    """
+    discord_id = applicant['discord_id']
+    # Tolerate missing or null optional fields so the string handling
+    # below cannot raise on None.
+    project_url = applicant.get('project_url') or ''
+    proficiency_self = applicant.get('proficiency_self') or ''
+    project_reason = applicant.get('project_reason') or ''
+
+    print(f"[{index+1}/{total}] Evaluating {discord_id}...")
+
+    result = {
+        'discord_id': discord_id,
+        'github_username': None,
+        'github_repos_count': 0,
+        'github_followers': 0,
+        'languages_from_github': [],
+        'languages_from_text': [],
+        'self_described_level': None,
+        'final_proficiency': 'intermediate',  # default
+        'tech_stack': [],
+        'notes': []
+    }
+
+    # Analyze self-description
+    text_level, text_techs = analyze_proficiency_text(proficiency_self + " " + project_reason)
+    result['self_described_level'] = text_level
+    result['languages_from_text'] = text_techs
+
+    # Fetch GitHub data if URL provided
+    has_github_url = 'github.com' in project_url
+    profile_fetch_failed = False
+    if has_github_url:
+        owner, repo = extract_github_info(project_url)
+
+        if owner:
+            result['github_username'] = owner
+
+            # Fetch user profile
+            user_data = fetch_github_user(owner)
+            if user_data:
+                result['github_repos_count'] = user_data.get('public_repos', 0)
+                result['github_followers'] = user_data.get('followers', 0)
+            else:
+                profile_fetch_failed = True
+
+            # Primary language of the first 10 repos returned
+            all_languages = {
+                r['language'].lower()
+                for r in fetch_github_repos(owner)[:10]
+                if r.get('language')
+            }
+
+            # If specific repo provided, add its languages as well
+            if repo:
+                all_languages.update(
+                    lang.lower() for lang in fetch_repo_languages(owner, repo)
+                )
+
+            # Sorted so the output does not depend on set iteration order.
+            result['languages_from_github'] = sorted(all_languages)
+
+            time.sleep(0.5)  # Rate limiting
+
+    # Combine tech stack
+    result['tech_stack'] = sorted(
+        set(result['languages_from_github']) | set(result['languages_from_text'])
+    )
+
+    # Determine final proficiency
+    # Factors: self-description, GitHub activity, tech diversity
+    github_score = (
+        _tier_points(result['github_repos_count'], 20, 10)
+        + _tier_points(result['github_followers'], 50, 10)
+        + _tier_points(len(result['tech_stack']), 6, 3)
+    )
+
+    # Map self-described level to score
+    level_scores = {'beginner': 0, 'intermediate': 2, 'advanced': 4}
+    total_score = github_score + level_scores.get(text_level, 2)
+
+    if total_score >= 7:
+        result['final_proficiency'] = 'advanced'
+    elif total_score >= 3:
+        result['final_proficiency'] = 'intermediate'
+    else:
+        result['final_proficiency'] = 'beginner'
+
+    # Add notes
+    if not has_github_url:
+        result['notes'].append('No GitHub URL provided')
+    if profile_fetch_failed:
+        # A failed lookup leaves the repo count at its default of 0; do
+        # not report that as an empty profile.
+        result['notes'].append('GitHub profile could not be fetched')
+    elif result['github_repos_count'] == 0 and result['github_username']:
+        result['notes'].append('GitHub profile has no public repos')
+
+    return result
+
+def main():
+    """Evaluate every applicant in the input file and write a JSON report.
+
+    Reads 'applicants_to_evaluate.json', evaluates each record in order,
+    writes all results to 'proficiency_evaluations.json' and prints the
+    number of applicants at each proficiency level.
+    """
+    # Load applicants
+    with open('applicants_to_evaluate.json', 'r') as f:
+        applicants = json.load(f)
+
+    total = len(applicants)
+    print(f"Evaluating {total} applicants...\n")
+
+    evaluations = []
+    for done, applicant in enumerate(applicants, start=1):
+        evaluations.append(evaluate_applicant(applicant, done - 1, total))
+
+        # Progress update every 10
+        if done % 10 == 0:
+            print(f"  Progress: {done}/{total} complete")
+
+    # Save results
+    with open('proficiency_evaluations.json', 'w') as f:
+        json.dump(evaluations, f, indent=2)
+
+    # Summary
+    levels = [evaluation['final_proficiency'] for evaluation in evaluations]
+
+    print("\n=== EVALUATION COMPLETE ===")
+    print(f"Beginner: {levels.count('beginner')}")
+    print(f"Intermediate: {levels.count('intermediate')}")
+    print(f"Advanced: {levels.count('advanced')}")
+    print(f"Total: {len(evaluations)}")
+    print("\nResults saved to proficiency_evaluations.json")
+
+if __name__ == "__main__":
+    main()