Added the script to shape the data for testing, and the associated SQLite database containing the test data
data/sqlite/02_baseline_products_dataset.db
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c614f87d479e5ffd3dab7ec185811dc59c52a27a41eed7c5788f23674c6d77fd
 size 17260544
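The database is stored via Git LFS, so the change above only rewrites the pointer file (object hash and size), not the database bytes themselves. As a minimal sketch (a hypothetical local check, not part of the commit, assuming the file at this path still holds the pointer text rather than the fetched binary), the pointer can be parsed like this:

# Hypothetical check: parse a Git LFS pointer file and print the oid and size
# it records (the values shown in the diff above).
pointer_path = "data/sqlite/02_baseline_products_dataset.db"

fields = {}
with open(pointer_path, "r", encoding="utf-8") as f:
    for line in f:
        key, _, value = line.strip().partition(" ")
        fields[key] = value

print(fields.get("oid"))   # e.g. sha256:c614f87d479e5ffd3dab7ec185811dc59c52a27a41eed7c5788f23674c6d77fd
print(fields.get("size"))  # e.g. 17260544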
data/vector_stores/products_tvs_chroma/08d5b637-758f-478e-9873-811a4c46eaff/link_lists.bin
DELETED
File without changes
src/data_synthesis/select_test_data_from_all_products.py
ADDED
@@ -0,0 +1,173 @@
"""
This file is a script intended to be run offline, not as part of the application.

It takes the initially generated "all_products" dataset and refines it down to a
more curated test set. The goals of this refinement are:

- size and efficiency: reduce the amount of data to speed up training and iteration
  on the core part of the project, which is architecture assessment
- more realistic data: produce a more realistic distribution. For example, the
  statistical generation process clustered a lot of products around a 3.5 rating
- hold some data back for specific tests, e.g. testing the subsequent addition of data
"""
import os
from random import randint, random, sample, shuffle
import shutil
import sqlite3

from typing import List

from src.common import data_dir

def src_db() -> str:
    return os.path.join(data_dir, 'sqlite', '01_all_products_dataset.db')


def dst_db() -> str:
    return os.path.join(data_dir, 'sqlite', '02_baseline_products_dataset.db')


def dst_conn() -> sqlite3.Connection:
    return sqlite3.connect(dst_db())


def copy_all_products_db() -> None:
    shutil.copy(src_db(), dst_db())


def join_strings_for_sql(items: List[str]) -> str:
    """
    Joins a list into a comma-separated string with each item enclosed in double quotes,
    for use in an SQL statement.
    """
    return ', '.join([f'"{i}"' for i in items])

def execute_sqls(sql_statements: List[str]):
    db_connection = dst_conn()
    db_cursor = db_connection.cursor()
    for s in sql_statements:
        db_cursor.execute(s)
    db_connection.commit()


def execute_select(sql_statement: str) -> List[List]:
    db_connection = dst_conn()
    db_cursor = db_connection.cursor()
    db_cursor.execute(sql_statement)
    return db_cursor.fetchall()

def drop_categories(keeping: List[str]) -> None:
    # Drop from the tables sequentially to avoid foreign key constraint issues
    keep_in = f"({join_strings_for_sql(keeping)})"
    sql_statements = []
    sql = f"delete from reviews where product_id in (select id from products where category_id in (select id from categories where name not in {keep_in}))"
    sql_statements.append(sql)
    sql = f"delete from product_features where product_id in (select id from products where category_id in (select id from categories where name not in {keep_in}))"
    sql_statements.append(sql)
    sql = f"delete from products where id in (select id from products where category_id in (select id from categories where name not in {keep_in}))"
    sql_statements.append(sql)
    sql = f"delete from features where category_id in (select id from categories where name not in {keep_in})"
    sql_statements.append(sql)
    sql = f"delete from categories where name not in {keep_in}"
    sql_statements.append(sql)
    execute_sqls(sql_statements)
    print(f'Narrowed categories down to {len(keeping)}')

def drop_products(ids: List[int]) -> None:
    ids_in = f"({join_strings_for_sql(ids)})"

    sql_statements = []
    sql = f"delete from reviews where product_id in {ids_in}"
    sql_statements.append(sql)
    sql = f"delete from product_features where product_id in {ids_in}"
    sql_statements.append(sql)
    sql = f"delete from products where id in {ids_in}"
    sql_statements.append(sql)
    execute_sqls(sql_statements)

def winnow_prods_per_category(min_count: int = 10, max_count: int = 25):
    sql = "select name from categories"
    categories = [r[0] for r in execute_select(sql)]
    for c in categories:
        target_prod_count = randint(min_count, max_count)
        sql = f'select id from products where category_id in (select id from categories where name = "{c}")'
        current_ids = [r[0] for r in execute_select(sql)]
        if len(current_ids) > target_prod_count:
            num_to_winnow = len(current_ids) - target_prod_count
            # Sample without replacement so exactly num_to_winnow distinct products are dropped
            ids_to_drop = sample(current_ids, k=num_to_winnow)
            drop_products(ids_to_drop)
            print(f'Winnowed {c} to {target_prod_count} products')
        else:
            print(f'{c} already at {len(current_ids)} - nothing to winnow')

def avg_rating(review_ratings1: List[List[int]], review_ratings2: List[List[int]]) -> float:
    review_count = len(review_ratings1) + len(review_ratings2)
    review_sum = sum([r[1] for r in review_ratings1]) + sum([r[1] for r in review_ratings2])
    return review_sum / review_count

def get_review_ids_to_drop(review_ratings: List[List[int]], target_review_count: int, target_avg_rating: float) -> List[int]:
    ids_to_drop = []
    rated_lower_than_target = [r for r in review_ratings if r[1] < target_avg_rating]
    rated_higher_than_target = [r for r in review_ratings if r[1] >= target_avg_rating]
    while len(rated_higher_than_target) + len(rated_lower_than_target) > target_review_count:
        if avg_rating(rated_higher_than_target, rated_lower_than_target) >= target_avg_rating:
            # Average is at or above target, so discard a higher-rated review to pull it down
            if len(rated_higher_than_target) == 0:
                break  # Avoid getting stuck in a loop when there's no remaining way to reach the target
            shuffle(rated_higher_than_target)
            ids_to_drop.append(rated_higher_than_target.pop()[0])
        else:
            # Average is below target, so discard a lower-rated review to pull it up
            if len(rated_lower_than_target) == 0:
                break  # Avoid getting stuck in a loop when there's no remaining way to reach the target
            shuffle(rated_lower_than_target)
            ids_to_drop.append(rated_lower_than_target.pop()[0])
    return ids_to_drop

def drop_reviews(review_ids: list[int]):
    if len(review_ids) == 0:
        print("Warning - got zero reviews to drop")
    else:
        ids_in = f'({", ".join([str(r) for r in review_ids])})'
        sql = f'delete from reviews where id in {ids_in}'
        execute_sqls([sql])

def drop_reviews_to_balance_avg_rating(min_review_count: int = 5,
                                       max_review_count: int = 30,
                                       min_target_avg_rating: float = 3.4,
                                       max_target_avg_rating: float = 4.9):
    sql = "select id, name from products"
    product_id_names = [(r[0], r[1]) for r in execute_select(sql)]
    for prod_id, prod_name in product_id_names:
        target_review_count = randint(min_review_count, max_review_count)
        target_avg_rating = min_target_avg_rating + (random() * (max_target_avg_rating - min_target_avg_rating))
        sql = f'select id, rating from reviews where product_id = {prod_id}'
        review_ratings = execute_select(sql)
        review_ids_to_drop = get_review_ids_to_drop(review_ratings, target_review_count, target_avg_rating)
        print(f'Dropping {len(review_ids_to_drop)} reviews for {prod_name} trying to get to average review of ~{target_avg_rating:.1f}')
        drop_reviews(review_ids_to_drop)

if __name__ == "__main__":
    copy_all_products_db()

    # Keep half the product categories - more recognisable ones
    cats = [
        "Dishwashers",
        "TVs",
        "Washing Machines",
        "Vacuum Cleaners",
        "Irons",
        "Electric Kettles",
        "Microwaves",
        "Food Processors",
        "Coffee Machines",
        "Toasters"
    ]
    drop_categories(keeping=cats)
    winnow_prods_per_category()
    drop_reviews_to_balance_avg_rating()
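For reference, a minimal sketch (not part of the commit) of exercising these helpers ad hoc with non-default settings; it assumes the repository root is on PYTHONPATH and that src.common.data_dir resolves as in the script above, and the parameter values are purely illustrative:

# Hypothetical ad-hoc usage of the script's helpers; the values below are
# illustrative, not the defaults actually used in __main__.
from src.data_synthesis.select_test_data_from_all_products import (
    copy_all_products_db,
    drop_categories,
    winnow_prods_per_category,
    drop_reviews_to_balance_avg_rating,
    get_review_ids_to_drop,
)

# Rebuild the baseline DB, keeping fewer categories and tighter per-category limits.
copy_all_products_db()
drop_categories(keeping=["TVs", "Coffee Machines"])
winnow_prods_per_category(min_count=5, max_count=10)
drop_reviews_to_balance_avg_rating(max_review_count=15)

# The review-trimming heuristic in isolation: given (id, rating) pairs, aim for
# roughly 3 reviews averaging about 4.0; it greedily discards reviews from
# whichever side of the target keeps the running average moving toward it.
ratings = [(1, 5), (2, 5), (3, 4), (4, 2), (5, 1)]
print(get_review_ids_to_drop(ratings, target_review_count=3, target_avg_rating=4.0))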