Spaces:

alfraser
/

llm-arch

Runtime error

App Files Files Community

alfraser committed on Jan 22, 2024

Commit

acb7b9c

1 Parent(s): 51d7c53

Tidied up comments

Browse files

Files changed (1) hide show

src/data_synthesis/generate_data.py +53 -1

src/data_synthesis/generate_data.py CHANGED Viewed

@@ -15,12 +15,18 @@ from src.common import data_dir
 class Review:
     def __init__(self, stars: int, review_text: str):
         self.stars = stars
         self.review_text = review_text
 class Product:
     def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
         self.category = category
         self.name = name
@@ -32,7 +38,7 @@ class Product:
 class DataPrompt:
     """
-    Holder for static prompt generation functions
     """
     @staticmethod
     def prompt_setup() -> str:
@@ -94,6 +100,9 @@ Please format the response as json in this style:
 def generate_products(category: str, features: List[str], k: int = 20):
     prompt = DataPrompt.products_for_category(category, features, k)
     response = openai.ChatCompletion.create(
         model="gpt-3.5-turbo-16k",
@@ -108,16 +117,27 @@ def generate_products(category: str, features: List[str], k: int = 20):
 def category_product_file(category: str) -> str:
     output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
     return os.path.join(data_dir, 'json', output_file_name)
 def category_review_file(category: str) -> str:
     output_file_name = f"reviews_{category.lower().replace(' ', '_')}.json"
     return os.path.join(data_dir, 'json', output_file_name)
 def products_for_category(category: str) -> List[Product]:
     cat_file = category_product_file(category)
     if not os.path.exists(cat_file):
         return []
@@ -141,6 +161,10 @@ def products_for_category(category: str) -> List[Product]:
 def product_names_for_category(category: str) -> List[str]:
     cat_file = category_product_file(category)
     if not os.path.exists(cat_file):
         return []
@@ -154,6 +178,10 @@ def product_names_for_category(category: str) -> List[str]:
 def add_products(category: str, product_json: str, k: int) -> None:
     cat_file = category_product_file(category)
     if not os.path.exists(cat_file):
         with open(cat_file, 'w') as f:
@@ -173,6 +201,10 @@ def add_products(category: str, product_json: str, k: int) -> None:
 def get_categories_and_features() -> Dict[str, List[str]]:
     product_features_file = os.path.join(data_dir, 'json', 'product_features.json')
     cats_and_feats = {}
     with open(product_features_file, 'r') as f:
@@ -185,6 +217,10 @@ def get_categories_and_features() -> Dict[str, List[str]]:
 def generate_all_products(target_count=40):
     product_features_file = os.path.join(data_dir, 'product_features.json')
     with open(product_features_file, 'r') as f:
@@ -202,6 +238,9 @@ def generate_all_products(target_count=40):
 def dump_products_to_csv():
     cats = get_categories_and_features().keys()
     cat_keys = []
     for cat in cats:
@@ -213,11 +252,17 @@ def dump_products_to_csv():
 def generate_reviews(target_count: int):
     for cat in get_categories_and_features().keys():
         generate_reviews_for_category(cat, target_count)
 def generate_reviews_for_category(category: str, target_count: int):
     batch_size = 25  # Max number of reviews to request in one go from GPT
     # Set up a loop to continue trying to find more work to do until complete
@@ -249,6 +294,9 @@ def generate_reviews_for_category(category: str, target_count: int):
 def generate_reviews_for_product(product: Product, k: int):
     prompt = DataPrompt.reviews_for_product(product, k)
     response = openai.ChatCompletion.create(
         model="gpt-3.5-turbo-16k",
@@ -263,6 +311,10 @@ def generate_reviews_for_product(product: Product, k: int):
 def add_reviews_to_product(reviews_json: str, product: Product):
     reviews_json = json.loads(reviews_json)
     reviews_file = category_review_file(product.category)
     if not os.path.exists(reviews_file):

 class Review:
+    """
+    Simple representation of a user Review of a Product
+    """
     def __init__(self, stars: int, review_text: str):
         self.stars = stars
         self.review_text = review_text
 class Product:
+    """
+    Simple representation of a product
+    """
     def __init__(self, category: str, name: str, description: str, price: float, features: List[str], reviews: List[Review]):
         self.category = category
         self.name = name
 class DataPrompt:
     """
+    Holder for static prompt generation functions for the data generation process
     """
     @staticmethod
     def prompt_setup() -> str:
 def generate_products(category: str, features: List[str], k: int = 20):
+    """
+    Invoke GPT3.5 Turbo model and get it to generate some products based on a category
+    """
     prompt = DataPrompt.products_for_category(category, features, k)
     response = openai.ChatCompletion.create(
         model="gpt-3.5-turbo-16k",
 def category_product_file(category: str) -> str:
+    """
+    Utility to get the file containing products in a category
+    """
     output_file_name = f"products_{category.lower().replace(' ', '_')}.json"
     return os.path.join(data_dir, 'json', output_file_name)
 def category_review_file(category: str) -> str:
+    """
+    Utility to get the file containing reviews of products in a category
+    """
     output_file_name = f"reviews_{category.lower().replace(' ', '_')}.json"
     return os.path.join(data_dir, 'json', output_file_name)
 def products_for_category(category: str) -> List[Product]:
+    """
+    Load all the associated products which have been generated for this
+    category, and the reviews, then merge the two and return a list of
+    all the products in this category along with their reviews
+    """
     cat_file = category_product_file(category)
     if not os.path.exists(cat_file):
         return []
 def product_names_for_category(category: str) -> List[str]:
+    """
+    Get a list of just the names of the products in this category
+    from the generated product json file
+    """
     cat_file = category_product_file(category)
     if not os.path.exists(cat_file):
         return []
 def add_products(category: str, product_json: str, k: int) -> None:
+    """
+    Given a string of json representing newly generated products,
+    add those products to the existing product json file for this category
+    """
     cat_file = category_product_file(category)
     if not os.path.exists(cat_file):
         with open(cat_file, 'w') as f:
 def get_categories_and_features() -> Dict[str, List[str]]:
+    """
+    Get a dictionary with each category as a key and the list of features
+    available to products in that category as the value
+    """
     product_features_file = os.path.join(data_dir, 'json', 'product_features.json')
     cats_and_feats = {}
     with open(product_features_file, 'r') as f:
 def generate_all_products(target_count=40):
+    """
+    Generate all products for all categories, trying to reach a given target count
+    of products.
+    """
     product_features_file = os.path.join(data_dir, 'product_features.json')
     with open(product_features_file, 'r') as f:
 def dump_products_to_csv():
+    """
+    Dump a csv file for debug, for every product showing category name and product name
+    """
     cats = get_categories_and_features().keys()
     cat_keys = []
     for cat in cats:
 def generate_reviews(target_count: int):
+    """
+    Generate reviews for each category up to a target count of reviews
+    """
     for cat in get_categories_and_features().keys():
         generate_reviews_for_category(cat, target_count)
 def generate_reviews_for_category(category: str, target_count: int):
+    """
+    Generate reviews for a specific category up to a given target number of reviews
+    """
     batch_size = 25  # Max number of reviews to request in one go from GPT
     # Set up a loop to continue trying to find more work to do until complete
 def generate_reviews_for_product(product: Product, k: int):
+    """
+    Generate a number of reviews from GPT3.5 for a specific product and add them to the product
+    """
     prompt = DataPrompt.reviews_for_product(product, k)
     response = openai.ChatCompletion.create(
         model="gpt-3.5-turbo-16k",
 def add_reviews_to_product(reviews_json: str, product: Product):
+    """
+    Load the reviews file for this product's category, append the new reviews to the list and
+    re-save the file
+    """
     reviews_json = json.loads(reviews_json)
     reviews_file = category_review_file(product.category)
     if not os.path.exists(reviews_file):