File size: 4,058 Bytes
a19a983
 
 
 
 
 
 
 
 
 
 
 
 
 
8a677b0
a19a983
 
 
 
b5d446f
 
 
a19a983
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5d446f
a19a983
b5d446f
 
 
a19a983
 
 
 
b5d446f
a19a983
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5d446f
a19a983
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
This script contains the second step of data synthesis - loading all of the data which was generated
in the json files into an sqlite rdbms structure for further analysis and usage.  This results in the
exhaustive dataset - not a trimmed down dataset for a particular usage.
"""

import os
import sqlite3

from src.common import data_dir
from src.data_synthesis.generate_data import get_categories_and_features, products_for_category


def db_file() -> str:
    output_file_name = f"01_all_products_dataset.db"
    return os.path.join(data_dir, 'sqlite', output_file_name)


def setup_db_tables() -> None:
    """
    Drop all the tables in the database and then re-build the structure empty
    """
    con = sqlite3.connect(db_file())
    tables = ['reviews', 'product_features', 'features', 'products', 'categories']
    for t in tables:
        con.execute(f'DROP TABLE IF EXISTS {t}')

    # Create categories table
    sql = "CREATE TABLE categories (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);"
    con.execute(sql)

    # Create features
    sql = "CREATE TABLE features (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
    con.execute(sql)

    # Create products
    sql = "CREATE TABLE products (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, description TEXT NOT NULL, price REAL NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
    con.execute(sql)

    # Create product / feature link
    sql = "CREATE TABLE product_features (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, feature_id INTEGER NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id), FOREIGN KEY (feature_id) REFERENCES features (id))"
    con.execute(sql)

    # Create reviews
    sql = "CREATE TABLE reviews (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, rating INTEGER NOT NULL, review_text TEXT NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id))"
    con.execute(sql)


def insert_data() -> None:
    """
    Insert the data from the json data files into the data structure
    """
    con = sqlite3.connect(db_file())
    cur = con.cursor()

    cats_and_features = get_categories_and_features()

    for cat, features in cats_and_features.items():
        sql = f"INSERT INTO categories('name') VALUES ('{cat}')"
        cat_id = con.execute(sql).lastrowid
        con.commit()
        sql = f"INSERT INTO features('name', 'category_id') VALUES "
        values = [f"('{f}', {cat_id})" for f in features]
        sql += ', '.join(values)
        con.execute(sql)
        con.commit()

        for prod in products_for_category(cat):
            sql = f"INSERT INTO products('name', 'description', 'price', 'category_id') VALUES (?, ?, ?, ?)"
            cur.execute(sql, (prod.name, prod.description, prod.price, cat_id))
            prod_id = cur.lastrowid
            con.commit()

            for feat in prod.features:
                sql = f"SELECT id from features WHERE lower(name)='{feat.lower()}' AND category_id={cat_id}"
                cur.execute(sql)
                rows = cur.fetchall()
                if len(rows) == 0:
                    print(f"Feature {feat} not found in category {cat} but used for {prod.name}")
                    sql = f"INSERT INTO features('name', 'category_id') VALUES (?, ?)"
                    cur.execute(sql, (feat, cat_id))
                    feat_id = cur.lastrowid
                else:
                    feat_id = rows[0][0]
                sql = f"INSERT INTO product_features('product_id', 'feature_id') VALUES ({prod_id}, {feat_id})"
                con.execute(sql)

            for review in prod.reviews:
                sql = f"INSERT INTO reviews('product_id', 'rating', 'review_text') VALUES (?, ?, ?)"
                cur.execute(sql, (prod_id, review.stars, review.review_text))
                con.commit()


if __name__ == "__main__":
    setup_db_tables()
    insert_data()