llm-arch / src /data_synthesis /data_loader.py
alfraser's picture
Added the scripts which were used to build the dataset to the repo, and tweaked to use common code
a19a983
raw
history blame
3.86 kB
"""
This script contains the second step of data synthesis - loading all of the data which was generated
in the json files into an sqlite rdbms structure for further analysis and usage. This results in the
exhaustive dataset - not a trimmed down dataset for a particular usage.
"""
import os
import sqlite3
from src.common import data_dir
from src.data_synthesis.generate_data import get_categories_and_features, products_for_category
def db_file() -> str:
output_file_name = f"products_dataset.db"
return os.path.join(data_dir, 'sqlite', output_file_name)
def setup_db_tables() -> None:
con = sqlite3.connect(db_file())
tables = ['reviews', 'product_features', 'features', 'products', 'categories']
for t in tables:
con.execute(f'DROP TABLE IF EXISTS {t}')
# Create categories table
sql = "CREATE TABLE categories (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);"
con.execute(sql)
# Create features
sql = "CREATE TABLE features (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
con.execute(sql)
# Create products
sql = "CREATE TABLE products (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, description TEXT NOT NULL, price REAL NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
con.execute(sql)
# Create product / feature link
sql = "CREATE TABLE product_features (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, feature_id INTEGER NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id), FOREIGN KEY (feature_id) REFERENCES features (id))"
con.execute(sql)
# Create reviews
sql = "CREATE TABLE reviews (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, rating INTEGER NOT NULL, review_text TEXT NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id))"
con.execute(sql)
def insert_data() -> None:
con = sqlite3.connect(db_file())
cur = con.cursor()
cats_and_features = get_categories_and_features()
for cat, features in cats_and_features.items():
sql = f"INSERT INTO categories('name') VALUES ('{cat}')"
cat_id = con.execute(sql).lastrowid
con.commit()
sql = f"INSERT INTO features('name', 'category_id') VALUES "
values = [f"('{f}', {cat_id})" for f in features]
sql += ', '.join(values)
con.execute(sql)
con.commit()
for prod in products_for_category(cat):
sql = f"INSERT INTO products('name', 'description', 'price', 'category_id') VALUES (?, ?, ?, ?)"
cur.execute(sql, (prod.name, prod.description, prod.price, cat_id))
prod_id = cur.lastrowid
con.commit()
for feat in prod.features:
sql = f"SELECT id from features WHERE name='{feat}' AND category_id={cat_id}"
cur.execute(sql)
rows = cur.fetchall()
if len(rows) == 0:
print(f"Feature {feat} not found in category {cat} but used for {prod.name}")
sql = f"INSERT INTO features('name', 'category_id') VALUES (?, ?)"
cur.execute(sql, (feat, cat_id))
feat_id = cur.lastrowid
else:
feat_id = rows[0][0]
sql = f"INSERT INTO product_features('product_id', 'feature_id') VALUES ({prod_id}, {feat_id})"
con.execute(sql)
for review in prod.reviews:
sql = f"INSERT INTO reviews('product_id', 'rating', 'review_text') VALUES (?, ?, ?)"
cur.execute(sql, (prod_id, review.stars, review.review_text))
con.commit()
if __name__ == "__main__":
setup_db_tables()
insert_data()