Spaces:
Running
Running
Richard
commited on
Commit
·
28ab31e
1
Parent(s):
297dca5
Add custom clue generation
Browse files- README.md +22 -1
- question_bank.py +4 -2
- sample_data/custom_jeopardy.json +0 -0
- scripts/categories.txt +85 -0
- scripts/generate_clues.py +295 -0
README.md
CHANGED
@@ -58,6 +58,28 @@ like this
|
|
58 |
}
|
59 |
```
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
## Screenshots
|
62 |
|
63 |
Here are some screenshots of the UI.
|
@@ -69,4 +91,3 @@ Here are some screenshots of the UI.
|
|
69 |
### Jeopardy answer
|
70 |
|
71 |
<img width="1312" alt="Screenshot 2025-01-26 at 2 04 05 PM" src="https://github.com/user-attachments/assets/97d704e1-6df6-4a05-8a77-8e91363295fa" />
|
72 |
-
|
|
|
58 |
}
|
59 |
```
|
60 |
|
61 |
+
### LLM Generated questions
|
62 |
+
|
63 |
+
You can also use LLMs to generate a question set using `scripts/generate_clues.py`. It
|
64 |
+
will generate a file called `sample_data/custom_jeopardy.json`.
|
65 |
+
|
66 |
+
Here is a basic example using some LLM-generated categories that I copied and pasted into
|
67 |
+
`scripts/categories.txt`. The script is a bit slow since it generates questions for one
|
68 |
+
category at a time and throttles itself to 15 RPM to stay within the limits of Gemini
|
69 |
+
1.5 Flash. A future improvement could be to generate batches of questions.
|
70 |
+
|
71 |
+
```
|
72 |
+
cd scripts
|
73 |
+
python generate_clues.py --file categories.txt --dataset ../data/custom_jeopardy.json --overwrite
|
74 |
+
```
|
75 |
+
|
76 |
+
If you're using a custom dataset, you can set an environment variable to specify the
|
77 |
+
location of the file.
|
78 |
+
|
79 |
+
```
|
80 |
+
JEOPARDY_DATASET_PATH=data/custom_jeopardy.json
|
81 |
+
```
|
82 |
+
|
83 |
## Screenshots
|
84 |
|
85 |
Here are some screenshots of the UI.
|
|
|
91 |
### Jeopardy answer
|
92 |
|
93 |
<img width="1312" alt="Screenshot 2025-01-26 at 2 04 05 PM" src="https://github.com/user-attachments/assets/97d704e1-6df6-4a05-8a77-8e91363295fa" />
|
|
question_bank.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import json
|
|
|
2 |
import re
|
3 |
from collections import defaultdict
|
4 |
|
@@ -7,7 +8,7 @@ from models import Clue
|
|
7 |
|
8 |
QuestionSet = list[Clue]
|
9 |
|
10 |
-
|
11 |
_NUM_QUESTIONS_PER_CATEGORY = 5
|
12 |
|
13 |
|
@@ -37,7 +38,8 @@ def _load_raw_data() -> QuestionSet:
|
|
37 |
"show_number": "4680"
|
38 |
}
|
39 |
"""
|
40 |
-
|
|
|
41 |
return [Clue(**row) for row in json.load(f)]
|
42 |
|
43 |
|
|
|
1 |
import json
|
2 |
+
import os
|
3 |
import re
|
4 |
from collections import defaultdict
|
5 |
|
|
|
8 |
|
9 |
QuestionSet = list[Clue]
|
10 |
|
11 |
+
_DEFAULT_JEOPARDY_DATASET_PATH = "data/jeopardy.json"
|
12 |
_NUM_QUESTIONS_PER_CATEGORY = 5
|
13 |
|
14 |
|
|
|
38 |
"show_number": "4680"
|
39 |
}
|
40 |
"""
|
41 |
+
file_path = os.getenv("JEOPARDY_DATASET_PATH", _DEFAULT_JEOPARDY_DATASET_PATH)
|
42 |
+
with open(file_path, "r") as f:
|
43 |
return [Clue(**row) for row in json.load(f)]
|
44 |
|
45 |
|
sample_data/custom_jeopardy.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
scripts/categories.txt
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Ancient Mysteries
|
2 |
+
Science Fiction in Reality
|
3 |
+
Word Origins
|
4 |
+
Famous Last Words
|
5 |
+
Musical Revolutionaries
|
6 |
+
World Cuisine
|
7 |
+
Hidden History
|
8 |
+
Nature's Superlatives
|
9 |
+
Inventions That Changed Everything
|
10 |
+
Literary Villains
|
11 |
+
Art Scandals
|
12 |
+
Mythology Around the World
|
13 |
+
Space Race Moments
|
14 |
+
Secret Languages
|
15 |
+
Historical What-Ifs
|
16 |
+
Extreme Weather
|
17 |
+
Archaeological Discoveries
|
18 |
+
Game-Changing Algorithms
|
19 |
+
Lost Cities
|
20 |
+
Scientific Breakthroughs
|
21 |
+
"B" Prepared
|
22 |
+
Rhyme Time
|
23 |
+
Before & After
|
24 |
+
Let It "Snow"
|
25 |
+
The "State" of Things
|
26 |
+
Recurring "Current" Events
|
27 |
+
"Tree"gonometry
|
28 |
+
A "Capital" Idea
|
29 |
+
"Novel" Beginnings
|
30 |
+
"Rock" of Ages
|
31 |
+
Out "Standing" in Their Fields
|
32 |
+
Initial "Here"
|
33 |
+
"Meow"velous Cats
|
34 |
+
"Chair"man of the Board
|
35 |
+
"Water" You Talking About
|
36 |
+
"Time" After Time
|
37 |
+
"Star"ting Lineup
|
38 |
+
"Food" For Thought
|
39 |
+
"Paint" By Numbers
|
40 |
+
"Sound" Investment
|
41 |
+
International Authors
|
42 |
+
First Lines
|
43 |
+
"Book" To The Future
|
44 |
+
Shakespeare's Women
|
45 |
+
Modern Poetry
|
46 |
+
Before & Chapter
|
47 |
+
Fantasy Worlds
|
48 |
+
Literary Adaptations
|
49 |
+
"Novel" Beginnings
|
50 |
+
Banned Books
|
51 |
+
Olympic Firsts
|
52 |
+
Sports "Ball" of Fame
|
53 |
+
Baseball Records
|
54 |
+
Championship Moments
|
55 |
+
"Ring" Leaders
|
56 |
+
Sports Science
|
57 |
+
World Cup Heroes
|
58 |
+
"Court" Proceedings
|
59 |
+
Sports Dynasties
|
60 |
+
Winter Olympics
|
61 |
+
NFL Milestones
|
62 |
+
"Field" of Dreams
|
63 |
+
Sports Nicknames
|
64 |
+
Tennis Legends
|
65 |
+
"Draft" Picks
|
66 |
+
Tech Pioneers
|
67 |
+
"Byte" Size Facts
|
68 |
+
Social Media History
|
69 |
+
"Computer" Language Arts
|
70 |
+
Silicon Valley Stories
|
71 |
+
"Net" Worth
|
72 |
+
Gaming Milestones
|
73 |
+
"Cyber" Space
|
74 |
+
AI Breakthroughs
|
75 |
+
Digital "Keys" to Success
|
76 |
+
Tech Fails
|
77 |
+
"Data" Day Problems
|
78 |
+
Start-Up Stories
|
79 |
+
"Cloud" Cover
|
80 |
+
Internet Firsts
|
81 |
+
"Code" Red
|
82 |
+
Mobile Moments
|
83 |
+
Hardware Headlines
|
84 |
+
"Web" of Intrigue
|
85 |
+
Digital Privacy
|
scripts/generate_clues.py
ADDED
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import typing
|
4 |
+
import time
|
5 |
+
from datetime import datetime
|
6 |
+
import random
|
7 |
+
import argparse
|
8 |
+
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
import google.generativeai as genai
|
11 |
+
|
12 |
+
# Load variables from a local .env file into the process environment.
load_dotenv()

# Flash 1.5 has a request limit of 15 RPM.
# Delay between generation requests, presumably in seconds (it matches the
# 5-second sleep in generate_questions_by_category).
GENERATE_DELAY = 5
DEFAULT_JEOPARDY_DATA = "../data/custom_jeopardy.json"
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Dataset file the helpers below read from and write to. main() rebinds this
# global when --dataset is given on the command line.
current_dataset_path = DEFAULT_JEOPARDY_DATA

genai.configure(api_key=GOOGLE_API_KEY)
|
23 |
+
|
24 |
+
|
25 |
+
class JeopardyQuestion(typing.TypedDict):
    """Shape of one generated clue; used as the model's response schema."""

    # Clue text.
    question: str
    # Expected response to the clue.
    answer: str
    # Value kept as text; the prompt asks for amounts such as $200.
    value: str
|
29 |
+
|
30 |
+
|
31 |
+
# Generation settings for clue writing. The JSON MIME type together with the
# list[JeopardyQuestion] schema requests structured output from the model.
_question_generation_config = genai.GenerationConfig(
    temperature=1,
    top_p=0.95,
    top_k=64,
    response_mime_type="application/json",
    response_schema=list[JeopardyQuestion],
)

question_gen_model = genai.GenerativeModel(
    "gemini-1.5-flash",
    generation_config=_question_generation_config,
)


# Prompt template; {category} is filled in per request via str.format().
_JEOPARDY_QUESTION_GENERATE_PROMPT = """
You are a Jeopardy! expert who specializes in crafting great questions.

Generate Jeopardy! questions for the following category: {category}.

A Jeopardy! category has 5 questions of increasing difficulty. The values are $200,
$400, $600, $800, $1000.
""".strip()
|
51 |
+
|
52 |
+
|
53 |
+
def get_existing_categories() -> set[str]:
    """Return the lower-cased category names already present in the dataset.

    Reads the JSON file at ``current_dataset_path``. A missing file yields an
    empty set; a file that is not valid JSON prints a warning and also yields
    an empty set.
    """
    try:
        with open(current_dataset_path, "r") as dataset_file:
            clues = json.load(dataset_file)
    except FileNotFoundError:
        return set()
    except json.JSONDecodeError:
        print(f"Warning: Error reading {current_dataset_path}. Treating as empty file.")
        return set()

    # Lower-case so callers can compare category names case-insensitively.
    seen = set()
    for clue in clues:
        seen.add(clue["category"].lower())
    return seen
|
64 |
+
|
65 |
+
|
66 |
+
def read_custom_jeopardy_questions_dataset():
    """Return the parsed JSON content of the file at ``current_dataset_path``.

    Returns an empty list when the file does not exist. Invalid JSON is not
    handled here and propagates as ``json.JSONDecodeError``.
    """
    try:
        dataset_file = open(current_dataset_path, "r")
    except FileNotFoundError:
        return []
    with dataset_file:
        return json.load(dataset_file)
|
72 |
+
|
73 |
+
|
74 |
+
def write_custom_jeopardy_questions_dataset(data, overwrite=False):
    """Write questions to the dataset file at ``current_dataset_path``.

    Args:
        data: List of questions to write.
        overwrite: If True, replace existing data. If False, append to the
            existing data; a missing, empty or invalid JSON file is treated
            as having no existing data.

    The existing file content is expected to be a JSON list, since new
    questions are appended with ``list.extend``.
    """
    # os.path.dirname() returns "" for a bare filename such as "clues.json",
    # and os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    parent_dir = os.path.dirname(current_dataset_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    records = data
    if not overwrite:
        try:
            with open(current_dataset_path, "r") as f:
                existing_data = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            # Nothing usable on disk: fall through and write only the new data.
            existing_data = None
        if existing_data is not None:
            existing_data.extend(data)
            records = existing_data

    with open(current_dataset_path, "w") as f:
        json.dump(records, f, indent=2)
|
112 |
+
|
113 |
+
|
114 |
+
def generate_questions_by_category(category) -> list[dict[str, str]]:
    """Generate Jeopardy questions for a category using Gemini.

    Sleeps for ``GENERATE_DELAY`` seconds before every request so that
    back-to-back calls stay under the model's requests-per-minute limit.

    Args:
        category: Category name substituted into the generation prompt.

    Returns:
        Generated jeopardy data set in the expected format: one dict per clue
        with the keys ``question``, ``answer``, ``value``, ``category``,
        ``air_date``, ``show_number`` and ``round``.

    Raises:
        json.JSONDecodeError: If the model response text is not valid JSON.
        KeyError: If a generated clue lacks ``question``, ``answer`` or
            ``value``.
    """
    # Throttle with the shared module constant instead of a duplicated literal.
    time.sleep(GENERATE_DELAY)

    response = question_gen_model.generate_content(
        _JEOPARDY_QUESTION_GENERATE_PROMPT.format(category=category)
    )
    generated = json.loads(response.text)

    # All clues of one category share a synthetic air date and show number.
    air_date = datetime.now().strftime("%Y-%m-%d")
    show_number = str(random.randint(1, 2000))

    # Format the questions like the data set.
    return [
        {
            "question": clue["question"],
            "answer": clue["answer"],
            "value": clue["value"],
            "category": category,
            "air_date": air_date,
            "show_number": show_number,
            "round": "Jeopardy!",
        }
        for clue in generated
    ]
|
143 |
+
|
144 |
+
|
145 |
+
def _clue_value_sort_key(clue: dict[str, str]) -> float:
    """Return a clue's value as a number; unparseable values sort last."""
    digits = str(clue["value"]).replace("$", "").replace(",", "").strip()
    try:
        return int(digits)
    except ValueError:
        # The value is model-generated text; one odd value such as "Final"
        # should not abort printing of the whole category.
        return float("inf")


def print_questions(questions: list[dict[str, str]], category: str):
    """Print the generated questions in a readable format.

    Args:
        questions: Clue dicts with ``value``, ``question`` and ``answer`` keys.
        category: Category name shown in the header.

    Clues are listed by ascending value. Values that cannot be read as a
    number keep their relative order and are listed after the numeric ones.
    """
    print(f"\nCategory: {category}\n")
    print("-" * 50)

    # Sort questions by value
    for clue in sorted(questions, key=_clue_value_sort_key):
        print(f"Value: {clue['value']}")
        print(f"Question: {clue['question']}")
        print(f"Answer: {clue['answer']}")
        print("-" * 50)
|
160 |
+
|
161 |
+
|
162 |
+
def get_categories_from_input() -> list[str]:
    """Get multiple categories from user input.

    Reads one category per line from standard input until an empty line or
    the end of input is reached.

    Returns:
        The entered category names, stripped of surrounding whitespace.
    """
    categories = []
    print("Enter categories (one per line). Press Enter twice when done:")

    while True:
        try:
            category = input().strip()
        except EOFError:
            # Piped or closed stdin ends without a blank line; treat that as
            # "done" instead of crashing.
            break
        if not category:
            break
        categories.append(category)

    return categories
|
174 |
+
|
175 |
+
|
176 |
+
def read_categories_from_file(filename: str) -> list[str]:
    """Read categories from a text file, one category per line.

    Blank lines are skipped and surrounding whitespace is removed. A category
    that repeats an earlier line (compared case-insensitively, the same way
    existing categories are matched) is dropped so it is not generated twice.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        Category names in file order, or an empty list if the file is missing
        or cannot be read or decoded (an error message is printed).
    """
    categories = []
    seen = set()
    try:
        # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise become part of the first category name.
        with open(filename, "r", encoding="utf-8-sig") as f:
            for line in f:
                name = line.strip()
                key = name.lower()
                if not name or key in seen:
                    continue
                seen.add(key)
                categories.append(name)
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found")
        return []
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading file: {e}")
        return []
    return categories
|
189 |
+
|
190 |
+
|
191 |
+
def filter_existing_categories(categories: list[str]) -> list[str]:
    """Return the categories that are not yet present in the dataset.

    Names are compared case-insensitively against get_existing_categories().
    Categories that already exist are listed on stdout and left out of the
    result; input order is preserved.
    """
    known = get_existing_categories()

    already_present = [name for name in categories if name.lower() in known]
    still_needed = [name for name in categories if name.lower() not in known]

    if already_present:
        print("\nSkipping the following existing categories:")
        for name in already_present:
            print(f"- {name}")

    return still_needed
|
209 |
+
|
210 |
+
|
211 |
+
def main():
    """Command-line entry point: generate and save clues for many categories.

    Categories come from ``--file``, ``--categories`` or, when neither is
    given, interactive input. Unless ``--overwrite`` is set, categories that
    already exist in the dataset are skipped. Each category is saved as soon
    as it is generated, so a failure only loses that one category. The final
    summary reports how many categories were saved and which ones failed.
    """
    global current_dataset_path

    parser = argparse.ArgumentParser(
        description="Generate Jeopardy questions for multiple categories"
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--categories", nargs="+", type=str, help="List of Jeopardy categories to generate questions"
    )
    group.add_argument(
        "--file", type=str, help="Path to text file containing categories (one per line)"
    )
    parser.add_argument(
        "--overwrite", action="store_true", help="Overwrite existing questions instead of appending"
    )
    parser.add_argument(
        "--dataset", type=str, help=f"Path to the dataset file (default: {DEFAULT_JEOPARDY_DATA})"
    )
    args = parser.parse_args()

    # Set the dataset path
    if args.dataset:
        current_dataset_path = args.dataset
        print(f"Using custom dataset path: {current_dataset_path}")

    # Determine which source to use for categories
    if args.file:
        categories = read_categories_from_file(args.file)
        if not categories:
            print("No valid categories found in file. Exiting.")
            return
    elif args.categories:
        categories = args.categories
    else:
        categories = get_categories_from_input()

    if not categories:
        print("No categories provided. Exiting.")
        return

    # Filter out existing categories
    if not args.overwrite:
        categories = filter_existing_categories(categories)
        if not categories:
            print("\nAll categories already exist in the dataset. Nothing to do.")
            return

    print(f"\nPreparing to generate questions for {len(categories)} categories:")
    for i, category in enumerate(categories, 1):
        print(f"{i}. {category}")
    print()

    if args.overwrite:
        print("Warning: This will overwrite all existing questions!")
        confirm = input("Do you want to continue? (y/N): ")
        # Strip so that an answer such as "y " is still accepted.
        if confirm.strip().lower() != "y":
            print("Operation cancelled.")
            return

        # Start from an empty dataset; each category is appended below.
        write_custom_jeopardy_questions_dataset([], overwrite=True)
        print("Initialized empty dataset for overwrite mode")

    failed_categories = []
    for i, category in enumerate(categories, 1):
        print(f"\nGenerating questions for category: {category} ({i}/{len(categories)})")
        try:
            questions = generate_questions_by_category(category)
            print_questions(questions, category)

            # Save after each category
            write_custom_jeopardy_questions_dataset(questions, overwrite=False)
            print(f"✓ Saved questions for {category}")

        except Exception as e:
            # Top-level boundary: report the failure and keep going so one bad
            # response does not abort the remaining categories.
            print(f"Error generating questions for {category}: {str(e)}")
            print("Skipping to next category...")
            failed_categories.append(category)

    saved_count = len(categories) - len(failed_categories)
    print(f"\nCompleted processing {len(categories)} categories.")
    if failed_categories:
        # Do not claim success for categories that were skipped after an error.
        print(f"Saved {saved_count} categories; {len(failed_categories)} failed:")
        for category in failed_categories:
            print(f"- {category}")
    if saved_count:
        print(f"All generated questions have been saved to {current_dataset_path}")


if __name__ == "__main__":
    main()
|