import logging
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Tuple

from tests.candidate import complete_interview
from tests.grader import grade

# Constants
INTERVIEW_TYPES = ["ml_design", "math", "ml_theory", "system_design", "sql", "coding"]
EDGE_CASE_MODES = ["empty", "gibberish", "repeat"]
MIN_AVERAGE_SCORE = 0.7
MIN_INTERVIEW_SCORE = 0.2
MAX_WORKERS = 5
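
# Note (added for clarity): MIN_INTERVIEW_SCORE is the per-interview floor (edge-case modes such as
# "empty" or "gibberish" may legitimately score low), while MIN_AVERAGE_SCORE is the stricter bar
# applied to the average across the whole run.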


def complete_and_grade_interview(interview_type: str, mode: str = "normal") -> Dict[str, Any]:
    """
    Complete an interview, grade it, and return the overall score and metadata.

    Args:
        interview_type (str): Type of the interview (one of INTERVIEW_TYPES).
        mode (str): Mode of the interview ("normal", "empty", "gibberish", "repeat").

    Returns:
        Dict[str, Any]: Dictionary containing the interview type, mode, and overall score.

    Raises:
        AssertionError: If the overall score is at or below MIN_INTERVIEW_SCORE.
    """
    # Run the candidate side of the interview, then grade the saved transcript.
    file_path, _ = complete_interview(interview_type, "test", model="gpt-4o-mini", mode=mode)
    feedback = grade(file_path, model="gpt-4o")
    score = feedback["overall_score"]

    assert (
        score > MIN_INTERVIEW_SCORE
    ), f"Score {score} is below minimum {MIN_INTERVIEW_SCORE} for {interview_type} interview in {mode} mode"

    return {"interview_type": interview_type, "mode": mode, "score": score}


def test_simulate_interview() -> None:
    """
    Test the complete interview process for various interview types, including edge cases.

    Runs interviews concurrently using a thread pool and checks the average score.
    """
    # Every interview type in "normal" mode, plus each edge-case mode on one randomly chosen type.
    interview_configs: List[Tuple[str, str]] = [(it, "normal") for it in INTERVIEW_TYPES] + [
        (random.choice(INTERVIEW_TYPES), mode) for mode in EDGE_CASE_MODES
    ]
    valid_results: List[Dict[str, Any]] = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_config = {
            executor.submit(complete_and_grade_interview, interview_type, mode): (interview_type, mode)
            for interview_type, mode in interview_configs
        }

        for future in as_completed(future_to_config):
            interview_type, mode = future_to_config[future]
            try:
                result = future.result()
                valid_results.append(result)
                logging.info(
                    f"Interview completed - Type: {result['interview_type']}, Mode: {result['mode']}, Score: {result['score']}"
                )
            except Exception as e:
                logging.error(f"Interview failed - Type: {interview_type}, Mode: {mode}, Error: {str(e)}")

    # Calculate and log the average score; guard against the case where every interview failed.
    assert valid_results, "No interviews completed successfully"
    average_score = sum(result["score"] for result in valid_results) / len(valid_results)
    logging.info(f"Average score across all interviews: {average_score:.2f}")

    # Assert on the average score
    assert average_score > MIN_AVERAGE_SCORE, f"Average score {average_score:.2f} is below minimum {MIN_AVERAGE_SCORE}"

    # Log summary of results per interview type
    for interview_type in INTERVIEW_TYPES:
        type_scores = [r["score"] for r in valid_results if r["interview_type"] == interview_type]
        if type_scores:
            avg_type_score = sum(type_scores) / len(type_scores)
            logging.info(f"Average score for {interview_type}: {avg_type_score:.2f}")

    # Check that we have results for all interview types and edge cases
    tested_types = {r["interview_type"] for r in valid_results}
    tested_modes = {r["mode"] for r in valid_results}
    assert tested_types == set(INTERVIEW_TYPES), f"Not all interview types were tested. Missing: {set(INTERVIEW_TYPES) - tested_types}"
    assert tested_modes == set(
        EDGE_CASE_MODES + ["normal"]
    ), f"Not all modes were tested. Missing: {set(EDGE_CASE_MODES + ['normal']) - tested_modes}"