alfraser committed on
Commit
564477a
·
1 Parent(s): 2cb7b84

Reviewed for comments and type hints

Browse files
src/data_synthesis/select_test_data_from_all_products.py CHANGED
@@ -32,6 +32,9 @@ def dst_conn() -> sqlite3.Connection:
32
 
33
 
34
  def copy_all_products_db() -> None:
 
 
 
35
  shutil.copy(src_db(), dst_db())
36
 
37
 
@@ -42,7 +45,10 @@ def join_strings_for_sql(items: List[str]) -> str:
42
  return ', '.join([f'"{i}"' for i in items])
43
 
44
 
45
- def execute_sqls(sql_statements: List[str]):
 
 
 
46
  db_connection = dst_conn()
47
  db_cursor = db_connection.cursor()
48
  for s in sql_statements:
@@ -51,6 +57,10 @@ def execute_sqls(sql_statements: List[str]):
51
 
52
 
53
  def execute_select(sql_statement: str) -> List[List]:
 
 
 
 
54
  db_connection = dst_conn()
55
  db_cursor = db_connection.cursor()
56
  db_cursor.execute(sql_statement)
@@ -58,7 +68,10 @@ def execute_select(sql_statement: str) -> List[List]:
58
 
59
 
60
  def drop_categories(keeping: List[str]) -> None:
61
- # Drop from the tables sequentially to avoid foreign key constraint issues
 
 
 
62
  keep_in = f"({join_strings_for_sql(keeping)})"
63
  sql_statements = []
64
  sql = f"delete from reviews where product_id in (select id from products where category_id in (select id from categories where name not in {keep_in}))"
@@ -76,6 +89,11 @@ def drop_categories(keeping: List[str]) -> None:
76
 
77
 
78
  def drop_products(ids: List[int]) -> None:
 
 
 
 
 
79
  ids_in = f"({join_strings_for_sql(ids)})"
80
 
81
  sql_statements = []
@@ -88,7 +106,10 @@ def drop_products(ids: List[int]) -> None:
88
  execute_sqls(sql_statements)
89
 
90
 
91
- def winnow_prods_per_category(min_count: int = 10, max_count: int = 25):
 
 
 
92
  sql = "select name from categories"
93
  categories = [r[0] for r in execute_select(sql)]
94
  for c in categories:
@@ -105,12 +126,22 @@ def winnow_prods_per_category(min_count: int = 10, max_count: int = 25):
105
 
106
 
107
  def avg_rating(review_ratings1: List[List[int]], review_ratings2: List[List[int]]) -> float:
 
 
 
108
  review_count = len(review_ratings1) + len(review_ratings2)
109
  review_sum = sum([r[1] for r in review_ratings1]) + sum([r[1] for r in review_ratings2])
110
  return review_sum / review_count
111
 
112
 
113
  def get_review_ids_to_drop(review_ratings: List[List[int]], target_review_count: int, target_avg_rating: float) -> List[int]:
 
 
 
 
 
 
 
114
  ids_to_drop = []
115
  rated_lower_than_target = [r for r in review_ratings if r[1] <= target_avg_rating]
116
  rated_higher_than_target = [r for r in review_ratings if r[1] > target_avg_rating]
@@ -128,7 +159,10 @@ def get_review_ids_to_drop(review_ratings: List[List[int]], target_review_count:
128
  return ids_to_drop
129
 
130
 
131
- def drop_reviews(review_ids: list[int]):
 
 
 
132
  if len(review_ids) == 0:
133
  print("Warning - got zero reviews to drop")
134
  else:
@@ -141,6 +175,10 @@ def drop_reviews_to_balance_avg_rating(min_review_count: int = 5,
141
  max_review_count: int = 30,
142
  min_target_avg_rating: float = 3.4,
143
  max_target_avg_rating: float = 4.9):
 
 
 
 
144
  sql = "select id, name from products"
145
  product_id_names = [(r[0], r[1]) for r in execute_select(sql)]
146
  for prod_id, prod_name in product_id_names:
 
32
 
33
 
34
  def copy_all_products_db() -> None:
35
+ """
36
+ Copies the source to the target database via a simple file copy.
37
+ """
38
  shutil.copy(src_db(), dst_db())
39
 
40
 
 
45
  return ', '.join([f'"{i}"' for i in items])
46
 
47
 
48
+ def execute_sqls(sql_statements: List[str]) -> None:
49
+ """
50
+ Executes a number of sql statements against the destination DB
51
+ """
52
  db_connection = dst_conn()
53
  db_cursor = db_connection.cursor()
54
  for s in sql_statements:
 
57
 
58
 
59
  def execute_select(sql_statement: str) -> List[List]:
60
+ """
61
+ Executes a single sql statement against the destination DB and returns the
62
+ complete associated resultset
63
+ """
64
  db_connection = dst_conn()
65
  db_cursor = db_connection.cursor()
66
  db_cursor.execute(sql_statement)
 
68
 
69
 
70
  def drop_categories(keeping: List[str]) -> None:
71
+ """
72
+ Drops whole categories from the destination DB. Only keeps the ones named in the list.
73
+ Drops from tables sequentially to respect foreign key constraints.
74
+ """
75
  keep_in = f"({join_strings_for_sql(keeping)})"
76
  sql_statements = []
77
  sql = f"delete from reviews where product_id in (select id from products where category_id in (select id from categories where name not in {keep_in}))"
 
89
 
90
 
91
  def drop_products(ids: List[int]) -> None:
92
+ """
93
+ Drops a number of products from the destination DB based on IDs. Drops from tables sequentially to respect foreign
94
+ key constraints
95
+ :param ids: the IDs of the products to drop
96
+ """
97
  ids_in = f"({join_strings_for_sql(ids)})"
98
 
99
  sql_statements = []
 
106
  execute_sqls(sql_statements)
107
 
108
 
109
+ def winnow_prods_per_category(min_count: int = 10, max_count: int = 25) -> None:
110
+ """
111
+ Thins out the number of products in each category so the number falls somewhere (randomly) within a target range
112
+ """
113
  sql = "select name from categories"
114
  categories = [r[0] for r in execute_select(sql)]
115
  for c in categories:
 
126
 
127
 
128
  def avg_rating(review_ratings1: List[List[int]], review_ratings2: List[List[int]]) -> float:
129
+ """
130
+ Calculates the average rating across two lists of reviews
131
+ """
132
  review_count = len(review_ratings1) + len(review_ratings2)
133
  review_sum = sum([r[1] for r in review_ratings1]) + sum([r[1] for r in review_ratings2])
134
  return review_sum / review_count
135
 
136
 
137
  def get_review_ids_to_drop(review_ratings: List[List[int]], target_review_count: int, target_avg_rating: float) -> List[int]:
138
+ """
139
+ Selects a set of reviews to drop from the dataset whilst trying to meet two objectives. First is to get close to
140
+ a target average review value (i.e. review stars for this product). Second is to do this while arriving at a
141
+ requested number of overall reviews. The approach is to split the reviews to higher than target / lower than target
142
+ and then repeatedly randomly select either a higher than target or lower than target individual review to
143
+ move the average in the right direction and get an approximately good solution.
144
+ """
145
  ids_to_drop = []
146
  rated_lower_than_target = [r for r in review_ratings if r[1] <= target_avg_rating]
147
  rated_higher_than_target = [r for r in review_ratings if r[1] > target_avg_rating]
 
159
  return ids_to_drop
160
 
161
 
162
+ def drop_reviews(review_ids: list[int]) -> None:
163
+ """
164
+ Drops specific review IDs from the destination DB.
165
+ """
166
  if len(review_ids) == 0:
167
  print("Warning - got zero reviews to drop")
168
  else:
 
175
  max_review_count: int = 30,
176
  min_target_avg_rating: float = 3.4,
177
  max_target_avg_rating: float = 4.9):
178
+ """
179
+ For each product generates a target rating in a range and a target number of reviews in a range
180
+ and then runs a process to try and get close to a solution for that product.
181
+ """
182
  sql = "select id, name from products"
183
  product_id_names = [(r[0], r[1]) for r in execute_select(sql)]
184
  for prod_id, prod_name in product_id_names: