Spaces:
Runtime error
Runtime error
Tidied up generate_data.py
Browse files
src/data_synthesis/generate_data.py
CHANGED
@@ -25,7 +25,7 @@ class Review:
|
|
25 |
|
26 |
class Product:
|
27 |
"""
|
28 |
-
Simple representation of a
|
29 |
"""
|
30 |
def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
|
31 |
self.category = category
|
@@ -38,18 +38,16 @@ class Product:
|
|
38 |
|
39 |
class DataPrompt:
|
40 |
"""
|
41 |
-
|
42 |
"""
|
43 |
@staticmethod
|
44 |
def prompt_setup() -> str:
|
45 |
return "You are a marketing assistant for consumer home electronics manufacturer ElectroHome. You are polite and succinct.\n\n"
|
46 |
|
47 |
-
|
48 |
@staticmethod
|
49 |
def prompt_setup_user() -> str:
|
50 |
return "You are a customer of consumer home electronics manufacturer ElectroHome, and are reviewing a product you have purchased and used.\n\n"
|
51 |
|
52 |
-
|
53 |
@staticmethod
|
54 |
def products_for_category(category: str, features: List[str], k: int) -> str:
|
55 |
existing_products = product_names_for_category(category)
|
@@ -101,7 +99,7 @@ Please format the response as json in this style:
|
|
101 |
|
102 |
def generate_products(category: str, features: List[str], k: int = 20):
|
103 |
"""
|
104 |
-
|
105 |
"""
|
106 |
prompt = DataPrompt.products_for_category(category, features, k)
|
107 |
response = openai.ChatCompletion.create(
|
@@ -118,7 +116,7 @@ def generate_products(category: str, features: List[str], k: int = 20):
|
|
118 |
|
119 |
def category_product_file(category: str) -> str:
|
120 |
"""
|
121 |
-
|
122 |
"""
|
123 |
output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
|
124 |
return os.path.join(data_dir, 'json', output_file_name)
|
@@ -263,7 +261,7 @@ def generate_reviews_for_category(category: str, target_count: int):
|
|
263 |
"""
|
264 |
Generate reviews for a specific category up to a given target number of reviews
|
265 |
"""
|
266 |
-
batch_size = 25 # Max number of reviews to request in one go from GPT
|
267 |
|
268 |
# Set up a loop to continue trying to find more work to do until complete
|
269 |
working = True
|
@@ -342,4 +340,17 @@ def add_reviews_to_product(reviews_json: str, product: Product):
|
|
342 |
# run generate_reviews_for_category(50) for each category # Generate 50 reviews per product in every category
|
343 |
"""
|
344 |
if __name__ == "__main__":
|
345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
class Product:
|
27 |
"""
|
28 |
+
Simple representation of a product
|
29 |
"""
|
30 |
def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
|
31 |
self.category = category
|
|
|
38 |
|
39 |
class DataPrompt:
|
40 |
"""
|
41 |
+
Class used as a namespace to hold the prompts used in the data generation process
|
42 |
"""
|
43 |
@staticmethod
|
44 |
def prompt_setup() -> str:
|
45 |
return "You are a marketing assistant for consumer home electronics manufacturer ElectroHome. You are polite and succinct.\n\n"
|
46 |
|
|
|
47 |
@staticmethod
|
48 |
def prompt_setup_user() -> str:
|
49 |
return "You are a customer of consumer home electronics manufacturer ElectroHome, and are reviewing a product you have purchased and used.\n\n"
|
50 |
|
|
|
51 |
@staticmethod
|
52 |
def products_for_category(category: str, features: List[str], k: int) -> str:
|
53 |
existing_products = product_names_for_category(category)
|
|
|
99 |
|
100 |
def generate_products(category: str, features: List[str], k: int = 20):
|
101 |
"""
|
102 |
+
Call GPT3.5 Turbo model and get it to generate some products based on a category
|
103 |
"""
|
104 |
prompt = DataPrompt.products_for_category(category, features, k)
|
105 |
response = openai.ChatCompletion.create(
|
|
|
116 |
|
117 |
def category_product_file(category: str) -> str:
|
118 |
"""
|
119 |
+
Get the file containing products in a category
|
120 |
"""
|
121 |
output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
|
122 |
return os.path.join(data_dir, 'json', output_file_name)
|
|
|
261 |
"""
|
262 |
Generate reviews for a specific category up to a given target number of reviews
|
263 |
"""
|
264 |
+
batch_size = 25 # Max number of reviews to request in one go from GPT so as not to overflow the token limit
|
265 |
|
266 |
# Set up a loop to continue trying to find more work to do until complete
|
267 |
working = True
|
|
|
340 |
# run generate_reviews_for_category(50) for each category # Generate 50 reviews per product in every category
|
341 |
"""
|
342 |
if __name__ == "__main__":
|
343 |
+
# Step 1 - manual - not shown here. See /data/json/product_categories.json and the /data/json/product_features.json files for the result
|
344 |
+
|
345 |
+
# Step 2 - generate the products within each category
|
346 |
+
# generate_all_products()
|
347 |
+
|
348 |
+
# Step 3 - dump the products to a CSV file for a manual check
|
349 |
+
# dump_products_to_csv()
|
350 |
+
|
351 |
+
# Step 4 - review and tweak names - manual - results are in the products_<category>.json files
|
352 |
+
|
353 |
+
# Step 5 - generate reviews for every product in each category (1 category at a time). Note: run in parallel from the command line.
|
354 |
+
# generate_reviews_for_category(sys.argv[1], int(sys.argv[2]))
|
355 |
+
|
356 |
+
print("No steps set up to run to avoid over-writing data. Please edit the file generate_data.py if you want to re-run generation")
|