Spaces:

richard-to
/

mesop-jeopardy-live

Running

mesop-jeopardy-live / question_bank.py

Richard

Add custom clue generation

28ab31e about 1 month ago

4.26 kB

	import json
	import os
	import re
	from collections import defaultdict

	from models import Clue


	# Alias for a list of clues. Used in this module both for the full data set and for
	# the clues that were grouped into a single category.
	QuestionSet = list[Clue]

	# Data set location used when the JEOPARDY_DATASET_PATH environment variable is unset.
	_DEFAULT_JEOPARDY_DATASET_PATH = "data/jeopardy.json"
	# Question sets with a different number of clues are filtered out in
	# `_filter_out_incomplete_question_sets`.
	_NUM_QUESTIONS_PER_CATEGORY = 5


	def load() -> list[QuestionSet]:
	    """Builds the cleaned up question sets used in the Mesop Jeopardy game.

	    The raw clues are loaded, given a sortable raw value and cleaned. They are then
	    grouped into question sets, which are sorted, normalized and finally filtered so
	    that only complete question sets are returned.
	    """
	    clues = _clean_questions(_add_raw_value(_load_raw_data()))
	    grouped = _group_into_question_sets(clues)
	    # Each remaining stage takes the list of question sets and returns the next one.
	    for stage in (
	        _sort_question_sets,
	        _normalize_values,
	        _filter_out_incomplete_question_sets,
	    ):
	        grouped = stage(grouped)
	    return grouped


	def _load_raw_data() -> QuestionSet:
	    """Load the raw data set.

	    The file path is taken from the JEOPARDY_DATASET_PATH environment variable and
	    falls back to `_DEFAULT_JEOPARDY_DATASET_PATH` when the variable is not set.

	    Format of each question/clue looks like this:

	    {
	      "category": "HISTORY",
	      "air_date": "2004-12-31",
	      "question": "'For the last 8 years of his life, Galileo was...",
	      "value": "$200",
	      "answer": "Copernicus",
	      "round": "Jeopardy!",
	      "show_number": "4680"
	    }

	    Returns:
	      One `Clue` per entry of the loaded JSON data, in file order. The fields of each
	      entry are passed to `Clue` as keyword arguments.
	    """
	    file_path = os.getenv("JEOPARDY_DATASET_PATH", _DEFAULT_JEOPARDY_DATASET_PATH)
	    # JSON text is UTF-8; being explicit keeps decoding independent of the platform's
	    # default locale encoding.
	    with open(file_path, encoding="utf-8") as f:
	        return [Clue(**row) for row in json.load(f)]


	def _add_raw_value(data: QuestionSet) -> QuestionSet:
	    """Stores an integer `raw_value` on every clue.

	    The `value` field is formatted as a dollar string, which isn't easy to sort, so
	    the converted amount is kept alongside it. The clues are updated in place and the
	    same list is returned.
	    """
	    for clue in data:
	        dollars = _convert_dollar_amount(clue.value)
	        clue.raw_value = dollars
	    return data


	def _clean_questions(data: QuestionSet) -> QuestionSet:
	    """Clean up questions.

	    - Strip single quotes around each question
	    - Replace escaped single quotes
	    - Strip HTML tags

	    The clues are modified in place and the same list is returned.
	    """
	    html_tag = re.compile(r"<[^<]+?>")
	    for row in data:
	        unquoted_left = row.question.lstrip("'")
	        text = unquoted_left.rstrip("'")
	        # When the question ends with an escaped quote (\') followed by the wrapping
	        # quote, stripping removes both and leaves a dangling backslash. Put the
	        # escaped quote back so it is unescaped correctly below.
	        if len(text) < len(unquoted_left) and text.endswith("\\"):
	            text += "'"
	        row.question = html_tag.sub("", text.replace("\\'", "'"))
	    return data


	def _convert_dollar_amount(value: str \| None) -> int:
	"""Coverts raw value into an integer.

	The raw value is string formatted as a dollar amount, such as $1,000. In this
	dataset the dollar amount is not given for Daily Doubles that were not answered, so
	we'll set those cases to a value of 0 for now.

	In addition, answered daily doubles will have odd dollar amounts.

	These values won't be used in the actually game. Only for roughly sorting the
	question difficulty.
	"""
	if value:
	return int(value.replace("$", "").replace(",", ""))
	else:
	return 0


	def _group_into_question_sets(data: QuestionSet) -> list[QuestionSet]:
	    """Groups the questions by category for that air date.

	    We want to mix and match questions across games, but we want to keep the questions
	    within a category together.

	    Returns:
	      One list per distinct (category, air_date) pair, ordered by first appearance in
	      `data`. Within each list the clues keep their input order.
	    """
	    question_sets = defaultdict(list)
	    for row in data:
	        question_sets[(row.category, row.air_date)].append(row)
	    return list(question_sets.values())


	def _sort_question_sets(question_sets: list[QuestionSet]) -> list[QuestionSet]:
	    """Returns a new list with `_sort_question_set` applied to every question set."""
	    return list(map(_sort_question_set, question_sets))


	def _sort_question_set(question_set: QuestionSet) -> QuestionSet:
	    """Orders one question set by ascending `raw_value`, roughly by difficulty.

	    The order will not always reflect difficulty because Daily Doubles skew it: the
	    data set did not store the Daily Double values separately from the normal game
	    value.

	    A new list is returned; the input list is left untouched.
	    """
	    ordered = list(question_set)
	    ordered.sort(key=lambda clue: clue.raw_value)
	    return ordered


	def _normalize_values(question_sets: list[QuestionSet]) -> list[QuestionSet]:
	    """Assigns each clue a dollar amount based on its position in its question set.

	    Since we are picking random categories across different rounds and years, the
	    original dollar values will differ, so they are normalized here: the first clue
	    gets 200, the second 400, and so on.

	    The clues are updated in place and the same outer list is returned.
	    """
	    for clues in question_sets:
	        for position, clue in enumerate(clues, start=1):
	            clue.normalized_value = position * 200
	    return question_sets


	def _filter_out_incomplete_question_sets(question_sets: list[QuestionSet]) -> list[QuestionSet]:
	    """Keeps only the question sets with exactly `_NUM_QUESTIONS_PER_CATEGORY` clues.

	    Final Jeopardy categories only have one question, so they are dropped here, along
	    with any anomalies in the data set.

	    There are also cases where not all questions were answered for a category; such a
	    category would leave a gap on the board, so it is dropped as well.
	    """
	    complete = []
	    for candidate in question_sets:
	        if len(candidate) != _NUM_QUESTIONS_PER_CATEGORY:
	            continue
	        complete.append(candidate)
	    return complete