Spaces:

alfraser
/

llm-arch

Runtime error

App Files Files Community

alfraser committed on Mar 4, 2024

Commit

a1317da

1 Parent(s): 507ac8e

Tidied up generate_data.py

Browse files

Files changed (1) hide show

src/data_synthesis/generate_data.py +19 -8

src/data_synthesis/generate_data.py CHANGED Viewed

@@ -25,7 +25,7 @@ class Review:
 class Product:
     """
-    Simple representation of a prduct
     """
     def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
         self.category = category
@@ -38,18 +38,16 @@ class Product:
 class DataPrompt:
     """
-    Holder for static prompt generation functions for the data generation process
     """
     @staticmethod
     def prompt_setup() -> str:
         return "You are a marketing assistant for consumer home electronics manufacturer ElectroHome. You are polite and succinct.\n\n"
     @staticmethod
     def prompt_setup_user() -> str:
         return "You are a customer of consumer home electronics manufacturer ElectroHome, and are reviewing a product you have purchased and used.\n\n"
     @staticmethod
     def products_for_category(category: str, features: List[str], k: int) -> str:
         existing_products = product_names_for_category(category)
@@ -101,7 +99,7 @@ Please format the response as json in this style:
 def generate_products(category: str, features: List[str], k: int = 20):
     """
-    Invoke GPT3.5 Turbo model and get it to generate some products based on a category
     """
     prompt = DataPrompt.products_for_category(category, features, k)
     response = openai.ChatCompletion.create(
@@ -118,7 +116,7 @@ def generate_products(category: str, features: List[str], k: int = 20):
 def category_product_file(category: str) -> str:
     """
-    Utility to get the file containing products in a category
     """
     output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
     return os.path.join(data_dir, 'json', output_file_name)
@@ -263,7 +261,7 @@ def generate_reviews_for_category(category: str, target_count: int):
     """
     Generate reviews for a specific category up to a given target number of reviews
     """
-    batch_size = 25  # Max number of reviews to request in one go from GPT
     # Set up a loop to continue trying to find more work to do until complete
     working = True
@@ -342,4 +340,17 @@ def add_reviews_to_product(reviews_json: str, product: Product):
 # run generate_reviews_for_category(50) for each category  # Generate 50 reviews per product in every category
 """
 if __name__ == "__main__":
-    generate_reviews_for_category(sys.argv[1], int(sys.argv[2]))

 class Product:
     """
+    Simple representation of a product
     """
     def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
         self.category = category
 class DataPrompt:
     """
+    Class as a Name Space to hold prompts used in the data generation process
     """
     @staticmethod
     def prompt_setup() -> str:
         return "You are a marketing assistant for consumer home electronics manufacturer ElectroHome. You are polite and succinct.\n\n"
     @staticmethod
     def prompt_setup_user() -> str:
         return "You are a customer of consumer home electronics manufacturer ElectroHome, and are reviewing a product you have purchased and used.\n\n"
     @staticmethod
     def products_for_category(category: str, features: List[str], k: int) -> str:
         existing_products = product_names_for_category(category)
 def generate_products(category: str, features: List[str], k: int = 20):
     """
+    Call GPT3.5 Turbo model and get it to generate some products based on a category
     """
     prompt = DataPrompt.products_for_category(category, features, k)
     response = openai.ChatCompletion.create(
 def category_product_file(category: str) -> str:
     """
+    Get the file containing products in a category
     """
     output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
     return os.path.join(data_dir, 'json', output_file_name)
     """
     Generate reviews for a specific category up to a given target number of reviews
     """
+    batch_size = 25  # Max number of reviews to request in one go from GPT so as not to overflow the token limit
     # Set up a loop to continue trying to find more work to do until complete
     working = True
 # run generate_reviews_for_category(50) for each category  # Generate 50 reviews per product in every category
 """
 if __name__ == "__main__":
+    # Step 1 - manual - not shown here.  See /data/json/product_categories.json and the /data/json/product_features.json files for the result
+    # Step 2 - generate the products within each category
+    # generate_all_products()
+    # Step 3 - dump the products to a CSV file for a manual check
+    # dump_products_to_csv()
+    # Step 4 - review and tweak names - manual - results are in the products_category.json files
+    # Step 5 - generate reviews for every product in each category (1 category at a time).  Note: run in parallel from the command line.
+    # generate_reviews_for_category(sys.argv[1], int(sys.argv[2]))
+    print("No steps set up to run to avoid over-writing data. Please edit the file generate_data.py if you want to re-run generation")