""" This script contains the second step of data synthesis - loading all of the data which was generated in the json files into an sqlite rdbms structure for further analysis and usage. This results in the exhaustive dataset - not a trimmed down dataset for a particular usage. """ import os import sqlite3 from src.common import data_dir from src.data_synthesis.generate_data import get_categories_and_features, products_for_category def db_file() -> str: output_file_name = f"01_all_products_dataset.db" return os.path.join(data_dir, 'sqlite', output_file_name) def setup_db_tables() -> None: """ Drop all the tables in the database and then re-build the structure empty """ con = sqlite3.connect(db_file()) tables = ['reviews', 'product_features', 'features', 'products', 'categories'] for t in tables: con.execute(f'DROP TABLE IF EXISTS {t}') # Create categories table sql = "CREATE TABLE categories (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);" con.execute(sql) # Create features sql = "CREATE TABLE features (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))" con.execute(sql) # Create products sql = "CREATE TABLE products (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, description TEXT NOT NULL, price REAL NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))" con.execute(sql) # Create product / feature link sql = "CREATE TABLE product_features (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, feature_id INTEGER NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id), FOREIGN KEY (feature_id) REFERENCES features (id))" con.execute(sql) # Create reviews sql = "CREATE TABLE reviews (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, rating INTEGER NOT NULL, review_text TEXT NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id))" con.execute(sql) def insert_data() -> None: """ Insert the data from the json data files into the data structure """ con = sqlite3.connect(db_file()) cur = con.cursor() cats_and_features = get_categories_and_features() for cat, features in cats_and_features.items(): sql = f"INSERT INTO categories('name') VALUES ('{cat}')" cat_id = con.execute(sql).lastrowid con.commit() sql = f"INSERT INTO features('name', 'category_id') VALUES " values = [f"('{f}', {cat_id})" for f in features] sql += ', '.join(values) con.execute(sql) con.commit() for prod in products_for_category(cat): sql = f"INSERT INTO products('name', 'description', 'price', 'category_id') VALUES (?, ?, ?, ?)" cur.execute(sql, (prod.name, prod.description, prod.price, cat_id)) prod_id = cur.lastrowid con.commit() for feat in prod.features: sql = f"SELECT id from features WHERE lower(name)='{feat.lower()}' AND category_id={cat_id}" cur.execute(sql) rows = cur.fetchall() if len(rows) == 0: print(f"Feature {feat} not found in category {cat} but used for {prod.name}") sql = f"INSERT INTO features('name', 'category_id') VALUES (?, ?)" cur.execute(sql, (feat, cat_id)) feat_id = cur.lastrowid else: feat_id = rows[0][0] sql = f"INSERT INTO product_features('product_id', 'feature_id') VALUES ({prod_id}, {feat_id})" con.execute(sql) for review in prod.reviews: sql = f"INSERT INTO reviews('product_id', 'rating', 'review_text') VALUES (?, ?, ?)" cur.execute(sql, (prod_id, review.stars, review.review_text)) con.commit() if __name__ == "__main__": setup_db_tables() insert_data()