Spaces:
Runtime error
Runtime error
Tidied up generate_data.py
Browse files
src/data_synthesis/generate_data.py
CHANGED
|
@@ -25,7 +25,7 @@ class Review:
|
|
| 25 |
|
| 26 |
class Product:
|
| 27 |
"""
|
| 28 |
-
Simple representation of a
|
| 29 |
"""
|
| 30 |
def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
|
| 31 |
self.category = category
|
|
@@ -38,18 +38,16 @@ class Product:
|
|
| 38 |
|
| 39 |
class DataPrompt:
|
| 40 |
"""
|
| 41 |
-
|
| 42 |
"""
|
| 43 |
@staticmethod
|
| 44 |
def prompt_setup() -> str:
|
| 45 |
return "You are a marketing assistant for consumer home electronics manufacturer ElectroHome. You are polite and succinct.\n\n"
|
| 46 |
|
| 47 |
-
|
| 48 |
@staticmethod
|
| 49 |
def prompt_setup_user() -> str:
|
| 50 |
return "You are a customer of consumer home electronics manufacturer ElectroHome, and are reviewing a product you have purchased and used.\n\n"
|
| 51 |
|
| 52 |
-
|
| 53 |
@staticmethod
|
| 54 |
def products_for_category(category: str, features: List[str], k: int) -> str:
|
| 55 |
existing_products = product_names_for_category(category)
|
|
@@ -101,7 +99,7 @@ Please format the response as json in this style:
|
|
| 101 |
|
| 102 |
def generate_products(category: str, features: List[str], k: int = 20):
|
| 103 |
"""
|
| 104 |
-
|
| 105 |
"""
|
| 106 |
prompt = DataPrompt.products_for_category(category, features, k)
|
| 107 |
response = openai.ChatCompletion.create(
|
|
@@ -118,7 +116,7 @@ def generate_products(category: str, features: List[str], k: int = 20):
|
|
| 118 |
|
| 119 |
def category_product_file(category: str) -> str:
|
| 120 |
"""
|
| 121 |
-
|
| 122 |
"""
|
| 123 |
output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
|
| 124 |
return os.path.join(data_dir, 'json', output_file_name)
|
|
@@ -263,7 +261,7 @@ def generate_reviews_for_category(category: str, target_count: int):
|
|
| 263 |
"""
|
| 264 |
Generate reviews for a specific category up to a given target number of reviews
|
| 265 |
"""
|
| 266 |
-
batch_size = 25 # Max number of reviews to request in one go from GPT
|
| 267 |
|
| 268 |
# Set up a loop to continue trying to find more work to do until complete
|
| 269 |
working = True
|
|
@@ -342,4 +340,17 @@ def add_reviews_to_product(reviews_json: str, product: Product):
|
|
| 342 |
# run generate_reviews_for_category(50) for each category # Generate 50 reviews per product in every category
|
| 343 |
"""
|
| 344 |
if __name__ == "__main__":
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
class Product:
|
| 27 |
"""
|
| 28 |
+
Simple representation of a product
|
| 29 |
"""
|
| 30 |
def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
|
| 31 |
self.category = category
|
|
|
|
| 38 |
|
| 39 |
class DataPrompt:
|
| 40 |
"""
|
| 41 |
+
Class used as a namespace to hold the prompts used in the data generation process
|
| 42 |
"""
|
| 43 |
@staticmethod
|
| 44 |
def prompt_setup() -> str:
|
| 45 |
return "You are a marketing assistant for consumer home electronics manufacturer ElectroHome. You are polite and succinct.\n\n"
|
| 46 |
|
|
|
|
| 47 |
@staticmethod
|
| 48 |
def prompt_setup_user() -> str:
|
| 49 |
return "You are a customer of consumer home electronics manufacturer ElectroHome, and are reviewing a product you have purchased and used.\n\n"
|
| 50 |
|
|
|
|
| 51 |
@staticmethod
|
| 52 |
def products_for_category(category: str, features: List[str], k: int) -> str:
|
| 53 |
existing_products = product_names_for_category(category)
|
|
|
|
| 99 |
|
| 100 |
def generate_products(category: str, features: List[str], k: int = 20):
|
| 101 |
"""
|
| 102 |
+
Call GPT3.5 Turbo model and get it to generate some products based on a category
|
| 103 |
"""
|
| 104 |
prompt = DataPrompt.products_for_category(category, features, k)
|
| 105 |
response = openai.ChatCompletion.create(
|
|
|
|
| 116 |
|
| 117 |
def category_product_file(category: str) -> str:
|
| 118 |
"""
|
| 119 |
+
Get the file containing products in a category
|
| 120 |
"""
|
| 121 |
output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
|
| 122 |
return os.path.join(data_dir, 'json', output_file_name)
|
|
|
|
| 261 |
"""
|
| 262 |
Generate reviews for a specific category up to a given target number of reviews
|
| 263 |
"""
|
| 264 |
+
batch_size = 25 # Max number of reviews to request in one go from GPT so as not to overflow the token limit
|
| 265 |
|
| 266 |
# Set up a loop to continue trying to find more work to do until complete
|
| 267 |
working = True
|
|
|
|
| 340 |
# run generate_reviews_for_category(50) for each category # Generate 50 reviews per product in every category
|
| 341 |
"""
|
| 342 |
if __name__ == "__main__":
|
| 343 |
+
# Step 1 - manual - not shown here. See /data/json/product_categories.json and the /data/json/product_features.json files for the result
|
| 344 |
+
|
| 345 |
+
# Step 2 - generate the products within each category
|
| 346 |
+
# generate_all_products()
|
| 347 |
+
|
| 348 |
+
# Step 3 - dump the products to a CSV file for a manual check
|
| 349 |
+
# dump_products_to_csv()
|
| 350 |
+
|
| 351 |
+
# Step 4 - review and tweak names - manual - results are in the products_category.json files
|
| 352 |
+
|
| 353 |
+
# Step 5 - generate reviews for every product in each category (1 category at a time). Note: run in parallel from the command line.
|
| 354 |
+
# generate_reviews_for_category(sys.argv[1], int(sys.argv[2]))
|
| 355 |
+
|
| 356 |
+
print("No steps set up to run to avoid over-writing data. Please edit the file generate_data.py if you want to re-run generation")
|