Commit e2cb6ae
Parent(s): 39440ed
Fix app and removed modules

Changed files:
- app.py +215 -10
- modules/__init__.py +0 -1
- modules/__pycache__/__init__.cpython-311.pyc +0 -0
- modules/__pycache__/arima.cpython-311.pyc +0 -0
- modules/__pycache__/preprocessor.cpython-311.pyc +0 -0
- modules/__pycache__/tapas.cpython-311.pyc +0 -0
- modules/arima.py +0 -68
- modules/preprocessor.py +0 -76
- modules/tapas.py +0 -43
app.py
CHANGED
@@ -1,6 +1,195 @@
 import streamlit as st
 import pandas as pd
-from
+from datetime import datetime
+
+import numpy as np
+import pmdarima as pm
+from pmdarima import auto_arima
+
+import torch
+from transformers import pipeline, TapasTokenizer, TapasForQuestionAnswering
+
+# Preprocessing
+def merge(B, C, A):
+    i = j = k = 0
+
+    # Convert 'Date' columns to datetime.date objects
+    B['Date'] = pd.to_datetime(B['Date']).dt.date
+    C['Date'] = pd.to_datetime(C['Date']).dt.date
+    A['Date'] = pd.to_datetime(A['Date']).dt.date
+
+    while i < len(B) and j < len(C):
+        if B['Date'].iloc[i] <= C['Date'].iloc[j]:
+            A['Date'].iloc[k] = B['Date'].iloc[i]
+            A['Sales'].iloc[k] = B['Sales'].iloc[i]
+            i += 1
+
+        else:
+            A['Date'].iloc[k] = C['Date'].iloc[j]
+            A['Sales'].iloc[k] = C['Sales'].iloc[j]
+            j += 1
+        k += 1
+
+    while i < len(B):
+        A['Date'].iloc[k] = B['Date'].iloc[i]
+        A['Sales'].iloc[k] = B['Sales'].iloc[i]
+        i += 1
+        k += 1
+
+    while j < len(C):
+        A['Date'].iloc[k] = C['Date'].iloc[j]
+        A['Sales'].iloc[k] = C['Sales'].iloc[j]
+        j += 1
+        k += 1
+
+    return A
+
+def merge_sort(dataframe):
+    if len(dataframe) > 1:
+        center = len(dataframe) // 2
+        left = dataframe.iloc[:center]
+        right = dataframe.iloc[center:]
+        merge_sort(left)
+        merge_sort(right)
+
+        return merge(left, right, dataframe)
+
+    else:
+        return dataframe
+
+def drop (dataframe):
+    def get_columns_containing(dataframe, substrings):
+        return [col for col in dataframe.columns if any(substring.lower() in col.lower() for substring in substrings)]
+
+    columns_to_keep = get_columns_containing(dataframe, ["date", "sale"])
+    dataframe = dataframe.drop(columns=dataframe.columns.difference(columns_to_keep))
+    dataframe = dataframe.dropna()
+
+    return dataframe
+
+def date_format(dataframe):
+    for i, d, s in dataframe.itertuples():
+        dataframe['Date'][i] = dataframe['Date'][i].strip()
+
+    for i, d, s in dataframe.itertuples():
+        new_date = datetime.strptime(dataframe['Date'][i], "%m/%d/%Y").date()
+        dataframe['Date'][i] = new_date
+
+    return dataframe
+
+def group_to_three(dataframe):
+    dataframe['Date'] = pd.to_datetime(dataframe['Date'])
+    dataframe = dataframe.groupby([pd.Grouper(key='Date', freq='3D')])['Sales'].mean().round(2)
+    dataframe = dataframe.replace(0, pd.np.nan).dropna()
+
+    return dataframe
+
+# SARIMAX Model
+def train_test(dataframe, n):
+    training_y = dataframe.iloc[:-n,0]
+    test_y = dataframe.iloc[-n:,0]
+    test_y_series = pd.Series(test_y, index=dataframe.iloc[-n:, 0].index)
+    training_X = dataframe.iloc[:-n,1:]
+    test_X = dataframe.iloc[-n:,1:]
+    future_X = dataframe.iloc[0:,1:]
+    return (training_y, test_y, test_y_series, training_X, test_X, future_X)
+
+def model_fitting(dataframe, Exo):
+    futureModel = pm.auto_arima(dataframe['Sales'], X=Exo, start_p=1, start_q=1,
+                                test='adf', min_p=1, min_q=1,
+                                max_p=3, max_q=3, m=12,
+                                start_P=0, seasonal=True,
+                                d=None, D=1, trace=True,
+                                error_action='ignore',
+                                suppress_warnings=True,
+                                stepwise=True)
+    model = futureModel
+    return model
+
+def test_fitting(dataframe, Exo, trainY):
+    trainTestModel = auto_arima(X = Exo, y = trainY, start_p=1, start_q=1,
+                                test='adf', min_p=1, min_q=1,
+                                max_p=3, max_q=3, m=12,
+                                start_P=0, seasonal=True,
+                                d=None, D=1, trace=True,
+                                error_action='ignore',
+                                suppress_warnings=True,
+                                stepwise=True)
+    model = trainTestModel
+    return model
+
+def forecast_accuracy(forecast, actual):
+    mape = np.mean(np.abs(forecast - actual)/np.abs(actual)).round(4)  # MAPE
+    rmse = (np.mean((forecast - actual)**2)**.5).round(2)  # RMSE
+    corr = np.corrcoef(forecast, actual)[0,1]  # corr
+    mins = np.amin(np.hstack([forecast[:,None],
+                              actual[:,None]]), axis=1)
+    maxs = np.amax(np.hstack([forecast[:,None],
+                              actual[:,None]]), axis=1)
+    minmax = 1 - np.mean(mins/maxs)  # minmax
+    return({'mape':mape, 'rmse':rmse, 'corr':corr, 'min-max':minmax})
+
+def sales_growth(dataframe, fittedValues):
+    sales_growth = fittedValues.to_frame()
+    sales_growth = sales_growth.reset_index()
+    sales_growth.columns = ("Date", "Sales")
+    sales_growth = sales_growth.set_index('Date')
+
+    sales_growth['Sales'] = (sales_growth['Sales']).round(2)
+
+    #Calculate and create the column for sales difference and growth
+    sales_growth['Forecasted Sales First Difference']=(sales_growth['Sales']-sales_growth['Sales'].shift(1)).round(2)
+    sales_growth['Forecasted Sales Growth']=(((sales_growth['Sales']-sales_growth['Sales'].shift(1))/sales_growth['Sales'].shift(1))*100).round(2)
+
+    #Calculate and create the first row for sales difference and growth
+    sales_growth['Forecasted Sales First Difference'].iloc[0] = (dataframe['Sales'].iloc[-1]-dataframe['Sales'].iloc[-2]).round(2)
+    sales_growth['Forecasted Sales Growth'].iloc[0]=(((dataframe['Sales'].iloc[-1]-dataframe['Sales'].iloc[-2])/dataframe['Sales'].iloc[-1])*100).round(2)
+
+
+    return sales_growth
+
+# TAPAS Model
+model_name = "google/tapas-large-finetuned-wtq"
+@st.cache
+def load_tapas_model(model_name):
+    tokenizer = TapasTokenizer.from_pretrained(model_name)
+    model = TapasForQuestionAnswering.from_pretrained(model_name, local_files_only=False)
+    pipe = pipeline("table-question-answering", model=model, tokenizer=tokenizer)
+    return pipe
+
+pipe = load_tapas_model(model_name)
+
+def get_answer(table, query):
+    answers = pipe(table=table, query=query)
+    print(answers['coordinates']) # FOR DEBUGGING PURPOSES
+    return answers
+
+def convert_answer(answer):
+    if answer['aggregator'] == 'SUM':
+        print(answer['answer']) # FOR DEBUGGING
+        cells = answer['cells']
+        converted = sum(float(value.replace(',', '')) for value in cells)
+        return converted
+
+    if answer['aggregator'] == 'AVERAGE':
+        print(answer['answer']) # FOR DEBUGGING
+        cells = answer['cells']
+        values = [float(value.replace(',', '')) for value in cells]
+        converted = sum(values) / len(values)
+        return converted
+
+    if answer['aggregator'] == 'COUNT':
+        print(answer['answer']) # FOR DEBUGGING
+        cells = answer['cells']
+        converted = sum(int(value.replace(',', '')) for value in cells)
+        return converted
+
+    else:
+        return answer
+
+def get_converted_answer(table, query):
+    converted_answer = convert_answer(get_answer(table, query))
+    return converted_answer
 
 st.title("Sales Forecasting Dashboard")
 st.write("📈 Welcome User, start using the application by uploading your file in the sidebbar!")
@@ -10,10 +199,10 @@ st.set_page_config(
     page_icon="📈",
     layout="wide",
     initial_sidebar_state="expanded",
-
+)
 
-
-
+if 'uploaded' not in st.session_state:
+    st.session_state.uploaded = 'uploaded'
 
 # Sidebar Menu
 with st.sidebar:
@@ -28,11 +217,27 @@ with st.sidebar:
         df = pd.read_csv(uploaded_file, parse_dates=True)
         st.write("Your uploaded data:")
         st.write(df)
+
         # Data pre-processing
-
-
-
-
-
+        df = drop(df)
+        df = date_format(df)
+        merge_sort(df)
+        df = group_to_three(df)
+        st.session_state.uploaded = True
+
     with open('sample.csv', 'rb') as f:
-        st.download_button("Download our sample CSV", f, file_name='sample.csv')
+        st.download_button("Download our sample CSV", f, file_name='sample.csv')
+
+    if (st.session_state.uploaded):
+        st.line_chart(df)
+
+        forecast_button_clicked = st.button(
+            'Start Forecasting',
+            key='forecast_button',
+            type="primary",
+            disabled=st.session_state.uploaded,
+        )
+
+        if (forecast_button_clicked):
+            # TODO call arima here
+            pass
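For orientation, here is a minimal sketch of how the preprocessing helpers that this commit inlines into app.py are chained by the sidebar code. The sample frame is illustrative, the helpers (drop, date_format, merge_sort, group_to_three) are assumed to be in scope, and the in-place writes plus pd.np assume the pandas 1.x behaviour the app was written against.

import pandas as pd

# Hypothetical toy input shaped like the CSVs the app expects: a 'Date'
# column of MM/DD/YYYY strings and a numeric 'Sales' column.
raw = pd.DataFrame({
    "Date": ["01/01/2023", "01/04/2023", "01/02/2023",
             "01/07/2023", "01/05/2023", "01/10/2023"],
    "Sales": [120.0, 150.25, 135.5, 160.75, 142.0, 155.0],
})

df = drop(raw)           # keep only Date/Sales-like columns, drop missing rows
df = date_format(df)     # strip whitespace and parse the MM/DD/YYYY strings
merge_sort(df)           # order the rows chronologically in place
df = group_to_three(df)  # average sales into 3-day buckets (uses pd.np, pandas < 2.0)
print(df)
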
modules/__init__.py
DELETED
@@ -1 +0,0 @@
-__all__ = ["preprocessor", "arima", "tapas"]
modules/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (236 Bytes)

modules/__pycache__/arima.cpython-311.pyc
DELETED
Binary file (5.32 kB)

modules/__pycache__/preprocessor.cpython-311.pyc
DELETED
Binary file (5.09 kB)

modules/__pycache__/tapas.cpython-311.pyc
DELETED
Binary file (2.91 kB)

modules/arima.py
DELETED
@@ -1,68 +0,0 @@
-import numpy as np
-import pandas as pd
-from datetime import datetime
-import pmdarima as pm
-from pmdarima import auto_arima
-
-def train_test(dataframe, n):
-    training_y = dataframe.iloc[:-n,0]
-    test_y = dataframe.iloc[-n:,0]
-    test_y_series = pd.Series(test_y, index=dataframe.iloc[-n:, 0].index)
-    training_X = dataframe.iloc[:-n,1:]
-    test_X = dataframe.iloc[-n:,1:]
-    future_X = dataframe.iloc[0:,1:]
-    return (training_y, test_y, test_y_series, training_X, test_X, future_X)
-
-def model_fitting(dataframe, Exo):
-    futureModel = pm.auto_arima(dataframe['Sales'], X=Exo, start_p=1, start_q=1,
-                                test='adf', min_p=1, min_q=1,
-                                max_p=3, max_q=3, m=12,
-                                start_P=0, seasonal=True,
-                                d=None, D=1, trace=True,
-                                error_action='ignore',
-                                suppress_warnings=True,
-                                stepwise=True)
-    model = futureModel
-    return model
-
-def test_fitting(dataframe, Exo, trainY):
-    trainTestModel = auto_arima(X = Exo, y = trainY, start_p=1, start_q=1,
-                                test='adf', min_p=1, min_q=1,
-                                max_p=3, max_q=3, m=12,
-                                start_P=0, seasonal=True,
-                                d=None, D=1, trace=True,
-                                error_action='ignore',
-                                suppress_warnings=True,
-                                stepwise=True)
-    model = trainTestModel
-    return model
-
-def forecast_accuracy(forecast, actual):
-    mape = np.mean(np.abs(forecast - actual)/np.abs(actual)).round(4)  # MAPE
-    rmse = (np.mean((forecast - actual)**2)**.5).round(2)  # RMSE
-    corr = np.corrcoef(forecast, actual)[0,1]  # corr
-    mins = np.amin(np.hstack([forecast[:,None],
-                              actual[:,None]]), axis=1)
-    maxs = np.amax(np.hstack([forecast[:,None],
-                              actual[:,None]]), axis=1)
-    minmax = 1 - np.mean(mins/maxs)  # minmax
-    return({'mape':mape, 'rmse':rmse, 'corr':corr, 'min-max':minmax})
-
-def sales_growth(dataframe, fittedValues):
-    sales_growth = fittedValues.to_frame()
-    sales_growth = sales_growth.reset_index()
-    sales_growth.columns = ("Date", "Sales")
-    sales_growth = sales_growth.set_index('Date')
-
-    sales_growth['Sales'] = (sales_growth['Sales']).round(2)
-
-    #Calculate and create the column for sales difference and growth
-    sales_growth['Forecasted Sales First Difference']=(sales_growth['Sales']-sales_growth['Sales'].shift(1)).round(2)
-    sales_growth['Forecasted Sales Growth']=(((sales_growth['Sales']-sales_growth['Sales'].shift(1))/sales_growth['Sales'].shift(1))*100).round(2)
-
-    #Calculate and create the first row for sales difference and growth
-    sales_growth['Forecasted Sales First Difference'].iloc[0] = (dataframe['Sales'].iloc[-1]-dataframe['Sales'].iloc[-2]).round(2)
-    sales_growth['Forecasted Sales Growth'].iloc[0]=(((dataframe['Sales'].iloc[-1]-dataframe['Sales'].iloc[-2])/dataframe['Sales'].iloc[-1])*100).round(2)
-
-
-    return sales_growth
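The app's 'Start Forecasting' button handler is still a TODO, so for context here is a sketch of how these helpers (now inlined into app.py) appear intended to fit together. The name grouped, the single-column frame, and the horizon n are illustrative assumptions, and auto_arima with m=12 and D=1 needs a couple of seasonal cycles of history to fit.

import numpy as np

# Assumes `grouped` is the Series returned by group_to_three, turned into the
# single-column 'Sales' frame that train_test expects (column 0 = Sales,
# no exogenous columns), and that the helpers above are in scope.
frame = grouped.to_frame(name="Sales")
n = 12  # illustrative test horizon

training_y, test_y, test_y_series, training_X, test_X, future_X = train_test(frame, n)

# Fit on the training slice only (no exogenous regressors), forecast the
# held-out horizon, and score the forecast against the actuals.
model = test_fitting(frame, None, training_y)
predictions = model.predict(n_periods=n)
print(forecast_accuracy(np.asarray(predictions), test_y.to_numpy()))
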
modules/preprocessor.py
DELETED
@@ -1,76 +0,0 @@
-import pandas as pd
-from datetime import datetime
-
-def merge(B, C, A):
-    i = j = k = 0
-
-    # Convert 'Date' columns to datetime.date objects
-    B['Date'] = pd.to_datetime(B['Date']).dt.date
-    C['Date'] = pd.to_datetime(C['Date']).dt.date
-    A['Date'] = pd.to_datetime(A['Date']).dt.date
-
-    while i < len(B) and j < len(C):
-        if B['Date'].iloc[i] <= C['Date'].iloc[j]:
-            A['Date'].iloc[k] = B['Date'].iloc[i]
-            A['Sales'].iloc[k] = B['Sales'].iloc[i]
-            i += 1
-
-        else:
-            A['Date'].iloc[k] = C['Date'].iloc[j]
-            A['Sales'].iloc[k] = C['Sales'].iloc[j]
-            j += 1
-        k += 1
-
-    while i < len(B):
-        A['Date'].iloc[k] = B['Date'].iloc[i]
-        A['Sales'].iloc[k] = B['Sales'].iloc[i]
-        i += 1
-        k += 1
-
-    while j < len(C):
-        A['Date'].iloc[k] = C['Date'].iloc[j]
-        A['Sales'].iloc[k] = C['Sales'].iloc[j]
-        j += 1
-        k += 1
-
-    return A
-
-def merge_sort(dataframe):
-    if len(dataframe) > 1:
-        center = len(dataframe) // 2
-        left = dataframe.iloc[:center]
-        right = dataframe.iloc[center:]
-        merge_sort(left)
-        merge_sort(right)
-
-        return merge(left, right, dataframe)
-
-    else:
-        return dataframe
-
-def drop (dataframe):
-    def get_columns_containing(dataframe, substrings):
-        return [col for col in dataframe.columns if any(substring.lower() in col.lower() for substring in substrings)]
-
-    columns_to_keep = get_columns_containing(dataframe, ["date", "sale"])
-    dataframe = dataframe.drop(columns=dataframe.columns.difference(columns_to_keep))
-    dataframe = dataframe.dropna()
-
-    return dataframe
-
-def date_format(dataframe):
-    for i, d, s in dataframe.itertuples():
-        dataframe['Date'][i] = dataframe['Date'][i].strip()
-
-    for i, d, s in dataframe.itertuples():
-        new_date = datetime.strptime(dataframe['Date'][i], "%m/%d/%Y").date()
-        dataframe['Date'][i] = new_date
-
-    return dataframe
-
-def group_to_three(dataframe):
-    dataframe['Date'] = pd.to_datetime(dataframe['Date'])
-    dataframe = dataframe.groupby([pd.Grouper(key='Date', freq='3D')])['Sales'].mean().round(2)
-    dataframe = dataframe.replace(0, pd.np.nan).dropna()
-
-    return dataframe
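As a point of comparison only (not something this commit changes), the chronological ordering that merge and merge_sort implement by hand can also be expressed with pandas built-ins:

import pandas as pd

# `df` stands in for any frame with MM/DD/YYYY 'Date' strings and a 'Sales'
# column; parse the dates, sort by them, and renumber the rows.
df_sorted = (
    df.assign(Date=pd.to_datetime(df["Date"], format="%m/%d/%Y").dt.date)
      .sort_values("Date")
      .reset_index(drop=True)
)
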
modules/tapas.py
DELETED
@@ -1,43 +0,0 @@
-import torch
-from transformers import pipeline, TapasTokenizer, TapasForQuestionAnswering
-
-model_name = "google/tapas-large-finetuned-wtq"
-
-# load the tokenizer and the model from huggingface model hub
-tokenizer = TapasTokenizer.from_pretrained(model_name)
-model = TapasForQuestionAnswering.from_pretrained(model_name, local_files_only=False)
-
-# load the model and tokenizer into a question-answering pipeline
-pipe = pipeline("table-question-answering", model=model, tokenizer=tokenizer)
-
-def get_answer(table, query):
-    answers = pipe(table=table, query=query)
-    print(answers['coordinates']) # FOR DEBUGGING PURPOSES
-    return answers
-
-def convert_answer(answer):
-    if answer['aggregator'] == 'SUM':
-        print(answer['answer']) # FOR DEBUGGING
-        cells = answer['cells']
-        converted = sum(float(value.replace(',', '')) for value in cells)
-        return converted
-
-    if answer['aggregator'] == 'AVERAGE':
-        print(answer['answer']) # FOR DEBUGGING
-        cells = answer['cells']
-        values = [float(value.replace(',', '')) for value in cells]
-        converted = sum(values) / len(values)
-        return converted
-
-    if answer['aggregator'] == 'COUNT':
-        print(answer['answer']) # FOR DEBUGGING
-        cells = answer['cells']
-        converted = sum(int(value.replace(',', '')) for value in cells)
-        return converted
-
-    else:
-        return answer
-
-def get_converted_answer(table, query):
-    converted_answer = convert_answer(get_answer(table, query))
-    return converted_answer
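Finally, a small sketch of how the TAPAS question-answering helpers (now also inlined into app.py) would be queried once a forecast table exists. The frame name and the question are illustrative; the table-question-answering pipeline expects every cell as a string.

# Hypothetical query against the helpers above. `forecast_df` stands in for a
# Date-indexed frame of forecasted sales; cast to strings before querying.
table = forecast_df.reset_index().astype(str)
total_sales = get_converted_answer(table, "What is the total of the Sales column?")
print(total_sales)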