mesop-jeopardy-live / question_bank.py
Richard
Add custom clue generation
28ab31e
raw
history blame
4.26 kB
import json
import os
import re
from collections import defaultdict
from models import Clue
# Alias for a list of clues. It is used both for the flat list of every clue in the
# dataset and for the clues of a single category on a single air date.
QuestionSet = list[Clue]
# Dataset location used when the JEOPARDY_DATASET_PATH environment variable is unset.
_DEFAULT_JEOPARDY_DATASET_PATH = "data/jeopardy.json"
# A question set must contain exactly this many clues to be returned by load().
_NUM_QUESTIONS_PER_CATEGORY = 5
def load() -> list[QuestionSet]:
    """Builds the cleaned-up question sets used by the Mesop Jeopardy game.

    The stages run in this order: load the raw clues, attach a sortable raw value,
    clean the question text, group the clues into question sets, sort each set,
    normalize the dollar values and finally drop the incomplete sets.
    """
    clues = _clean_questions(_add_raw_value(_load_raw_data()))
    grouped = _group_into_question_sets(clues)
    ordered = _sort_question_sets(grouped)
    normalized = _normalize_values(ordered)
    return _filter_out_incomplete_question_sets(normalized)
def _load_raw_data() -> QuestionSet:
    """Load the raw data set.

    The file path is taken from the `JEOPARDY_DATASET_PATH` environment variable and
    falls back to `_DEFAULT_JEOPARDY_DATASET_PATH` when that variable is not set.

    Format of each question/clue looks like this:

    {
      "category": "HISTORY",
      "air_date": "2004-12-31",
      "question": "'For the last 8 years of his life, Galileo was...",
      "value": "$200",
      "answer": "Copernicus",
      "round": "Jeopardy!",
      "show_number": "4680"
    }

    Returns:
      One `Clue` per entry of the top-level JSON array, in file order. Each entry's
      keys are passed to `Clue` as keyword arguments.

    Raises:
      OSError: If the file cannot be opened.
      json.JSONDecodeError: If the file does not contain valid JSON.
    """
    file_path = os.getenv("JEOPARDY_DATASET_PATH", _DEFAULT_JEOPARDY_DATASET_PATH)
    # Be explicit about the encoding: without it open() falls back to the platform's
    # locale encoding, which mis-decodes a UTF-8 JSON file on non-UTF-8 systems.
    with open(file_path, "r", encoding="utf-8") as f:
        return [Clue(**row) for row in json.load(f)]
def _add_raw_value(data: QuestionSet) -> QuestionSet:
    """Attaches an integer `raw_value` to every clue.

    `value` is formatted as a dollar string, which is not easy to sort on, so the
    parsed amount is stored next to it. The clues are updated in place and the same
    list is returned.
    """
    for clue in data:
        amount = _convert_dollar_amount(clue.value)
        clue.raw_value = amount
    return data
def _clean_questions(data: QuestionSet) -> QuestionSet:
    """Tidies up the question text of every clue, in place.

    The following steps are applied to each question, in this order:

    1. Strip the single quotes around the question
    2. Replace escaped single quotes with plain single quotes
    3. Strip HTML tags

    The same list that was passed in is returned.
    """
    html_tag = re.compile("<[^<]+?>")
    for clue in data:
        # NOTE(review): stripping happens before unescaping, so a question that ends
        # in an escaped quote followed by the wrapping quote keeps a stray trailing
        # backslash. Confirm whether the dataset contains such questions.
        unwrapped = clue.question.strip("'")
        unescaped = unwrapped.replace("\\'", "'")
        clue.question = html_tag.sub("", unescaped)
    return data
def _convert_dollar_amount(value: str | None) -> int:
"""Coverts raw value into an integer.
The raw value is string formatted as a dollar amount, such as $1,000. In this
dataset the dollar amount is not given for Daily Doubles that were not answered, so
we'll set those cases to a value of 0 for now.
In addition, answered daily doubles will have odd dollar amounts.
These values won't be used in the actually game. Only for roughly sorting the
question difficulty.
"""
if value:
return int(value.replace("$", "").replace(",", ""))
else:
return 0
def _group_into_question_sets(data: QuestionSet) -> list[QuestionSet]:
    """Groups the questions by category for that air date.

    We want to mix and match questions across games, but we want to keep the
    questions within a category together.

    Args:
      data: Flat list of clues.

    Returns:
      One list per distinct (category, air_date) pair. The groups appear in the order
      their first clue appears in `data`, and the clues inside a group keep their
      relative order from `data`.
    """
    # `list` is the idiomatic default factory; no lambda wrapper is needed.
    question_sets = defaultdict(list)
    for row in data:
        question_sets[(row.category, row.air_date)].append(row)
    return list(question_sets.values())
def _sort_question_sets(question_sets: list[QuestionSet]) -> list[QuestionSet]:
    """Returns a new list in which every question set has been sorted.

    Each set is sorted with `_sort_question_set`; the order of the sets themselves
    is unchanged.
    """
    return list(map(_sort_question_set, question_sets))
def _sort_question_set(question_set: QuestionSet) -> QuestionSet:
    """Returns a copy of the question set ordered roughly by difficulty.

    Clues are ordered by ascending `raw_value`. The result will not always reflect
    the true difficulty because Daily Doubles skew the order: the data set did not
    store the Daily Double values separately from the normal game value.
    """
    ordered = list(question_set)
    ordered.sort(key=lambda clue: clue.raw_value)
    return ordered
def _normalize_values(question_sets: list[QuestionSet]) -> list[QuestionSet]:
    """Normalizes question dollar amounts based on order of appearance.

    Since we are picking random categories across different rounds and years, the
    dollar values will differ, so they are normalized here: the first clue of a set
    gets 200, the second 400, and so on. The clues are updated in place and the same
    list is returned.
    """
    for question_set in question_sets:
        for position, question in enumerate(question_set, start=1):
            question.normalized_value = position * 200
    return question_sets
def _filter_out_incomplete_question_sets(question_sets: list[QuestionSet]) -> list[QuestionSet]:
    """Keeps only the question sets that are complete.

    A set is complete when it contains exactly `_NUM_QUESTIONS_PER_CATEGORY` clues.

    Final Jeopardy categories only have one question, so we want to ignore those.
    We also want to avoid anomalies in the data set.

    In addition there are cases where not all questions were answered for a
    category, which would leave a question missing on the board.
    """

    def _is_complete(question_set):
        return len(question_set) == _NUM_QUESTIONS_PER_CATEGORY

    return list(filter(_is_complete, question_sets))