Spaces:
Runtime error
Runtime error
Reviewed for comments and type hints
Browse files
src/data_synthesis/select_test_data_from_all_products.py
CHANGED
@@ -32,6 +32,9 @@ def dst_conn() -> sqlite3.Connection:
|
|
32 |
|
33 |
|
34 |
def copy_all_products_db() -> None:
|
|
|
|
|
|
|
35 |
shutil.copy(src_db(), dst_db())
|
36 |
|
37 |
|
@@ -42,7 +45,10 @@ def join_strings_for_sql(items: List[str]) -> str:
|
|
42 |
return ', '.join([f'"{i}"' for i in items])
|
43 |
|
44 |
|
45 |
-
def execute_sqls(sql_statements: List[str]):
|
|
|
|
|
|
|
46 |
db_connection = dst_conn()
|
47 |
db_cursor = db_connection.cursor()
|
48 |
for s in sql_statements:
|
@@ -51,6 +57,10 @@ def execute_sqls(sql_statements: List[str]):
|
|
51 |
|
52 |
|
53 |
def execute_select(sql_statement: str) -> List[List]:
|
|
|
|
|
|
|
|
|
54 |
db_connection = dst_conn()
|
55 |
db_cursor = db_connection.cursor()
|
56 |
db_cursor.execute(sql_statement)
|
@@ -58,7 +68,10 @@ def execute_select(sql_statement: str) -> List[List]:
|
|
58 |
|
59 |
|
60 |
def drop_categories(keeping: List[str]) -> None:
|
61 |
-
|
|
|
|
|
|
|
62 |
keep_in = f"({join_strings_for_sql(keeping)})"
|
63 |
sql_statements = []
|
64 |
sql = f"delete from reviews where product_id in (select id from products where category_id in (select id from categories where name not in {keep_in}))"
|
@@ -76,6 +89,11 @@ def drop_categories(keeping: List[str]) -> None:
|
|
76 |
|
77 |
|
78 |
def drop_products(ids: List[int]) -> None:
|
|
|
|
|
|
|
|
|
|
|
79 |
ids_in = f"({join_strings_for_sql(ids)})"
|
80 |
|
81 |
sql_statements = []
|
@@ -88,7 +106,10 @@ def drop_products(ids: List[int]) -> None:
|
|
88 |
execute_sqls(sql_statements)
|
89 |
|
90 |
|
91 |
-
def winnow_prods_per_category(min_count: int = 10, max_count: int = 25):
|
|
|
|
|
|
|
92 |
sql = "select name from categories"
|
93 |
categories = [r[0] for r in execute_select(sql)]
|
94 |
for c in categories:
|
@@ -105,12 +126,22 @@ def winnow_prods_per_category(min_count: int = 10, max_count: int = 25):
|
|
105 |
|
106 |
|
107 |
def avg_rating(review_ratings1: List[List[int]], review_ratings2: List[List[int]]) -> float:
|
|
|
|
|
|
|
108 |
review_count = len(review_ratings1) + len(review_ratings2)
|
109 |
review_sum = sum([r[1] for r in review_ratings1]) + sum([r[1] for r in review_ratings2])
|
110 |
return review_sum / review_count
|
111 |
|
112 |
|
113 |
def get_review_ids_to_drop(review_ratings: List[List[int]], target_review_count: int, target_avg_rating: float) -> List[int]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
ids_to_drop = []
|
115 |
rated_lower_than_target = [r for r in review_ratings if r[1] <= target_avg_rating]
|
116 |
rated_higher_than_target = [r for r in review_ratings if r[1] > target_avg_rating]
|
@@ -128,7 +159,10 @@ def get_review_ids_to_drop(review_ratings: List[List[int]], target_review_count:
|
|
128 |
return ids_to_drop
|
129 |
|
130 |
|
131 |
-
def drop_reviews(review_ids: list[int]):
|
|
|
|
|
|
|
132 |
if len(review_ids) == 0:
|
133 |
print("Warning - got zero reviews to drop")
|
134 |
else:
|
@@ -141,6 +175,10 @@ def drop_reviews_to_balance_avg_rating(min_review_count: int = 5,
|
|
141 |
max_review_count: int = 30,
|
142 |
min_target_avg_rating: float = 3.4,
|
143 |
max_target_avg_rating: float = 4.9):
|
|
|
|
|
|
|
|
|
144 |
sql = "select id, name from products"
|
145 |
product_id_names = [(r[0], r[1]) for r in execute_select(sql)]
|
146 |
for prod_id, prod_name in product_id_names:
|
|
|
32 |
|
33 |
|
34 |
def copy_all_products_db() -> None:
|
35 |
+
"""
|
36 |
+
Copies the source to the target database via a simple file copy.
|
37 |
+
"""
|
38 |
shutil.copy(src_db(), dst_db())
|
39 |
|
40 |
|
|
|
45 |
return ', '.join([f'"{i}"' for i in items])
|
46 |
|
47 |
|
48 |
+
def execute_sqls(sql_statements: List[str]) -> None:
|
49 |
+
"""
|
50 |
+
Executes a number of sql statements against the destination DB
|
51 |
+
"""
|
52 |
db_connection = dst_conn()
|
53 |
db_cursor = db_connection.cursor()
|
54 |
for s in sql_statements:
|
|
|
57 |
|
58 |
|
59 |
def execute_select(sql_statement: str) -> List[List]:
|
60 |
+
"""
|
61 |
+
Executes a single sql statement against the destination DB and returns the
|
62 |
+
complete associated resultset
|
63 |
+
"""
|
64 |
db_connection = dst_conn()
|
65 |
db_cursor = db_connection.cursor()
|
66 |
db_cursor.execute(sql_statement)
|
|
|
68 |
|
69 |
|
70 |
def drop_categories(keeping: List[str]) -> None:
|
71 |
+
"""
|
72 |
+
Drops whole categories from the destination DB. Only keeps the ones named in the list.
|
73 |
+
Drops from tables sequentially to respect foreign key constraints.
|
74 |
+
"""
|
75 |
keep_in = f"({join_strings_for_sql(keeping)})"
|
76 |
sql_statements = []
|
77 |
sql = f"delete from reviews where product_id in (select id from products where category_id in (select id from categories where name not in {keep_in}))"
|
|
|
89 |
|
90 |
|
91 |
def drop_products(ids: List[int]) -> None:
|
92 |
+
"""
|
93 |
+
Drops a number of products from the destination DB based on IDs. Drops from tables sequentially to respect foreign
|
94 |
+
key constraints
|
95 |
+
:param ids: the IDs of the products to drop
|
96 |
+
"""
|
97 |
ids_in = f"({join_strings_for_sql(ids)})"
|
98 |
|
99 |
sql_statements = []
|
|
|
106 |
execute_sqls(sql_statements)
|
107 |
|
108 |
|
109 |
+
def winnow_prods_per_category(min_count: int = 10, max_count: int = 25) -> None:
|
110 |
+
"""
|
111 |
+
Thins out the number of products in each category so the number falls somewhere (randomly) within a target range
|
112 |
+
"""
|
113 |
sql = "select name from categories"
|
114 |
categories = [r[0] for r in execute_select(sql)]
|
115 |
for c in categories:
|
|
|
126 |
|
127 |
|
128 |
def avg_rating(review_ratings1: List[List[int]], review_ratings2: List[List[int]]) -> float:
|
129 |
+
"""
|
130 |
+
Calculates the average rating across two lists of reviews
|
131 |
+
"""
|
132 |
review_count = len(review_ratings1) + len(review_ratings2)
|
133 |
review_sum = sum([r[1] for r in review_ratings1]) + sum([r[1] for r in review_ratings2])
|
134 |
return review_sum / review_count
|
135 |
|
136 |
|
137 |
def get_review_ids_to_drop(review_ratings: List[List[int]], target_review_count: int, target_avg_rating: float) -> List[int]:
|
138 |
+
"""
|
139 |
+
Selects a set of reviews to drop from the dataset whilst trying to meet two objectives. First is to get close to
|
140 |
+
a target average review value (i.e. review stars for this product). Second is to do this while arriving at a
|
141 |
+
requested number of overall reviews. The approach is to split the reviews to higher than target / lower than target
|
142 |
+
and then repeatedly randomly select either a higher than target or lower than target individual review to
|
143 |
+
move the average in the right direction and get an approximately good solution.
|
144 |
+
"""
|
145 |
ids_to_drop = []
|
146 |
rated_lower_than_target = [r for r in review_ratings if r[1] <= target_avg_rating]
|
147 |
rated_higher_than_target = [r for r in review_ratings if r[1] > target_avg_rating]
|
|
|
159 |
return ids_to_drop
|
160 |
|
161 |
|
162 |
+
def drop_reviews(review_ids: list[int]) -> None:
|
163 |
+
"""
|
164 |
+
Drops specific review IDs from the destination DB.
|
165 |
+
"""
|
166 |
if len(review_ids) == 0:
|
167 |
print("Warning - got zero reviews to drop")
|
168 |
else:
|
|
|
175 |
max_review_count: int = 30,
|
176 |
min_target_avg_rating: float = 3.4,
|
177 |
max_target_avg_rating: float = 4.9):
|
178 |
+
"""
|
179 |
+
For each product generates a target rating in a range and a target number of reviews in a range
|
180 |
+
and then runs a process to try and get close to a solution for that product.
|
181 |
+
"""
|
182 |
sql = "select id, name from products"
|
183 |
product_id_names = [(r[0], r[1]) for r in execute_select(sql)]
|
184 |
for prod_id, prod_name in product_id_names:
|