Spaces:
Running
Running
import json | |
import os | |
import re | |
from collections import defaultdict | |
from models import Clue | |
# A list of clues. Used both for the flat raw data set and for the clues that
# make up a single category.
QuestionSet = list[Clue]

# Dataset location used when the JEOPARDY_DATASET_PATH environment variable is
# not set.
_DEFAULT_JEOPARDY_DATASET_PATH: str = "data/jeopardy.json"

# A category must contain exactly this many clues to be kept as a question set.
_NUM_QUESTIONS_PER_CATEGORY: int = 5
def load() -> list[QuestionSet]:
    """Build the cleaned-up question sets used by the Mesop Jeopardy game.

    The raw clues are loaded, given a sortable integer value, and have their
    question text cleaned. They are then grouped into question sets, each set
    is sorted and given normalized values, and incomplete sets are dropped.

    Returns:
      The complete question sets, one list of clues per set.
    """
    clues = _clean_questions(_add_raw_value(_load_raw_data()))
    grouped = _group_into_question_sets(clues)
    ordered = _sort_question_sets(grouped)
    valued = _normalize_values(ordered)
    return _filter_out_incomplete_question_sets(valued)
def _load_raw_data() -> QuestionSet:
    """Load the raw data set.

    The file path is taken from the JEOPARDY_DATASET_PATH environment variable
    and falls back to _DEFAULT_JEOPARDY_DATASET_PATH when it is not set. Each
    JSON row is passed to Clue as keyword arguments.

    Format of each question/clue looks like this:

      {
        "category": "HISTORY",
        "air_date": "2004-12-31",
        "question": "'For the last 8 years of his life, Galileo was...",
        "value": "$200",
        "answer": "Copernicus",
        "round": "Jeopardy!",
        "show_number": "4680"
      }

    Returns:
      One Clue per row of the JSON file, in file order.

    Raises:
      OSError: If the data set file cannot be opened.
      json.JSONDecodeError: If the file does not contain valid JSON.
    """
    file_path = os.getenv("JEOPARDY_DATASET_PATH", _DEFAULT_JEOPARDY_DATASET_PATH)
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # locale encoding (which is not UTF-8 on every system).
    with open(file_path, encoding="utf-8") as f:
        return [Clue(**row) for row in json.load(f)]
def _add_raw_value(data: QuestionSet) -> QuestionSet:
    """Attach an integer `raw_value` to every clue.

    The `value` field is a dollar-formatted string, which is awkward to sort on,
    so it is converted once here and stored next to the original.

    The clues are modified in place and the same list is returned.
    """
    for clue in data:
        dollars = _convert_dollar_amount(clue.value)
        clue.raw_value = dollars
    return data
def _clean_questions(data: QuestionSet) -> QuestionSet:
    """Clean up the question text of every clue.

    - Remove the single quote wrapped around each question (at most one per side)
    - Replace backslash-escaped single quotes with plain single quotes
    - Strip HTML tags

    The clues are modified in place and the same list is returned.
    """
    for row in data:
        # Remove only ONE wrapping quote per side. A greedy strip("'") would also
        # eat the quote that belongs to a trailing escape sequence (\' followed
        # by the wrapping quote), leaving a stray backslash in the question.
        text = row.question.removeprefix("'").removesuffix("'")
        # Unescape after unwrapping so escaped quotes at the edges survive.
        text = text.replace("\\'", "'")
        row.question = re.sub(r"<[^<]+?>", "", text)
    return data
def _convert_dollar_amount(value: str | None) -> int: | |
"""Coverts raw value into an integer. | |
The raw value is string formatted as a dollar amount, such as $1,000. In this | |
dataset the dollar amount is not given for Daily Doubles that were not answered, so | |
we'll set those cases to a value of 0 for now. | |
In addition, answered daily doubles will have odd dollar amounts. | |
These values won't be used in the actually game. Only for roughly sorting the | |
question difficulty. | |
""" | |
if value: | |
return int(value.replace("$", "").replace(",", "")) | |
else: | |
return 0 | |
def _group_into_question_sets(data: QuestionSet) -> list[QuestionSet]:
    """Group clues that share both a category and an air date.

    Questions may be mixed and matched across games, but the questions within
    one category have to stay together.

    Returns:
      One list of clues per (category, air_date) pair, ordered by the first
      appearance of each pair in `data`.
    """
    grouped = defaultdict(list)
    for clue in data:
        key = (clue.category, clue.air_date)
        grouped[key].append(clue)
    return [*grouped.values()]
def _sort_question_sets(question_sets: list[QuestionSet]) -> list[QuestionSet]:
    """Return a new list in which every question set has been sorted."""
    return list(map(_sort_question_set, question_sets))
def _sort_question_set(question_set: QuestionSet) -> QuestionSet:
    """Return the clues ordered by ascending `raw_value`.

    This roughly orders the clues by difficulty. It will not always be accurate
    because Daily Doubles skew the order: the data set did not store the Daily
    Double values separately from the normal game value.

    The input list is left untouched; a sorted copy is returned.
    """
    ordered = list(question_set)
    ordered.sort(key=lambda clue: clue.raw_value)
    return ordered
def _normalize_values(question_sets: list[QuestionSet]) -> list[QuestionSet]:
    """Assign each clue a dollar value based on its position within its set.

    Since categories are picked at random across different rounds and years,
    the original dollar values differ, so they are normalized here: the first
    clue of a set gets 200, the second 400, and so on.

    The clues are modified in place and the same outer list is returned.
    """
    for question_set in question_sets:
        for position, clue in enumerate(question_set, start=1):
            clue.normalized_value = position * 200
    return question_sets
def _filter_out_incomplete_question_sets(question_sets: list[QuestionSet]) -> list[QuestionSet]:
    """Keep only the question sets that have the expected number of questions.

    A set is kept when its length equals _NUM_QUESTIONS_PER_CATEGORY.

    Final Jeopardy categories only have one question, so they are dropped here.
    This also guards against anomalies in the data set, and against categories
    where not every question was answered, which would otherwise leave a gap
    on the board.
    """

    def is_complete(candidate: QuestionSet) -> bool:
        return len(candidate) == _NUM_QUESTIONS_PER_CATEGORY

    return list(filter(is_complete, question_sets))