alfraser committed on
Commit
a1317da
·
1 Parent(s): 507ac8e

Tidied up generate_data.py

Browse files
Files changed (1) hide show
  1. src/data_synthesis/generate_data.py +19 -8
src/data_synthesis/generate_data.py CHANGED
@@ -25,7 +25,7 @@ class Review:
25
 
26
  class Product:
27
  """
28
- Simple representation of a prduct
29
  """
30
  def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
31
  self.category = category
@@ -38,18 +38,16 @@ class Product:
38
 
39
  class DataPrompt:
40
  """
41
- Holder for static prompt generation functions for the data generation process
42
  """
43
  @staticmethod
44
  def prompt_setup() -> str:
45
  return "You are a marketing assistant for consumer home electronics manufacturer ElectroHome. You are polite and succinct.\n\n"
46
 
47
-
48
  @staticmethod
49
  def prompt_setup_user() -> str:
50
  return "You are a customer of consumer home electronics manufacturer ElectroHome, and are reviewing a product you have purchased and used.\n\n"
51
 
52
-
53
  @staticmethod
54
  def products_for_category(category: str, features: List[str], k: int) -> str:
55
  existing_products = product_names_for_category(category)
@@ -101,7 +99,7 @@ Please format the response as json in this style:
101
 
102
  def generate_products(category: str, features: List[str], k: int = 20):
103
  """
104
- Invoke GPT3.5 Turbo model and get it to generate some products based on a category
105
  """
106
  prompt = DataPrompt.products_for_category(category, features, k)
107
  response = openai.ChatCompletion.create(
@@ -118,7 +116,7 @@ def generate_products(category: str, features: List[str], k: int = 20):
118
 
119
  def category_product_file(category: str) -> str:
120
  """
121
- Utility to get the file containing products in a category
122
  """
123
  output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
124
  return os.path.join(data_dir, 'json', output_file_name)
@@ -263,7 +261,7 @@ def generate_reviews_for_category(category: str, target_count: int):
263
  """
264
  Generate reviews for a specific category up to a given target number of reviews
265
  """
266
- batch_size = 25 # Max number of reviews to request in one go from GPT
267
 
268
  # Set up a loop to continue trying to find more work to do until complete
269
  working = True
@@ -342,4 +340,17 @@ def add_reviews_to_product(reviews_json: str, product: Product):
342
  # run generate_reviews_for_category(50) for each category # Generate 50 reviews per product in every category
343
  """
344
  if __name__ == "__main__":
345
- generate_reviews_for_category(sys.argv[1], int(sys.argv[2]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  class Product:
27
  """
28
+ Simple representation of a product
29
  """
30
  def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
31
  self.category = category
 
38
 
39
  class DataPrompt:
40
  """
41
+ Class used as a namespace to hold the prompts used in the data generation process
42
  """
43
  @staticmethod
44
  def prompt_setup() -> str:
45
  return "You are a marketing assistant for consumer home electronics manufacturer ElectroHome. You are polite and succinct.\n\n"
46
 
 
47
  @staticmethod
48
  def prompt_setup_user() -> str:
49
  return "You are a customer of consumer home electronics manufacturer ElectroHome, and are reviewing a product you have purchased and used.\n\n"
50
 
 
51
  @staticmethod
52
  def products_for_category(category: str, features: List[str], k: int) -> str:
53
  existing_products = product_names_for_category(category)
 
99
 
100
  def generate_products(category: str, features: List[str], k: int = 20):
101
  """
102
+ Call GPT3.5 Turbo model and get it to generate some products based on a category
103
  """
104
  prompt = DataPrompt.products_for_category(category, features, k)
105
  response = openai.ChatCompletion.create(
 
116
 
117
  def category_product_file(category: str) -> str:
118
  """
119
+ Get the file containing products in a category
120
  """
121
  output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
122
  return os.path.join(data_dir, 'json', output_file_name)
 
261
  """
262
  Generate reviews for a specific category up to a given target number of reviews
263
  """
264
+ batch_size = 25 # Max number of reviews to request in one go from GPT so as not to overflow the token limit
265
 
266
  # Set up a loop to continue trying to find more work to do until complete
267
  working = True
 
340
  # run generate_reviews_for_category(50) for each category # Generate 50 reviews per product in every category
341
  """
342
  if __name__ == "__main__":
343
+ # Step 1 - manual - not shown here. See /data/json/product_categories.json and the /data/json/product_features.json files for the result
344
+
345
+ # Step 2 - generate the products within each category
346
+ # generate_all_products()
347
+
348
+ # Step 3 - dump the products to a CSV file for a manual check
349
+ # dump_products_to_csv()
350
+
351
+ # Step 4 - review and tweak names - manual - results are in the products_category.json files
352
+
353
+ # Step 5 - generate reviews for every product in each category (1 category at a time). Note: run in parallel from the command line.
354
+ # generate_reviews_for_category(sys.argv[1], int(sys.argv[2]))
355
+
356
+ print("No steps set up to run to avoid over-writing data. Please edit the file generate_data.py if you want to re-run generation")