AI-Manith committed on
Commit
86b447d
·
verified ·
1 Parent(s): 1e61b24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -16
app.py CHANGED
@@ -8,6 +8,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.neighbors import NearestNeighbors
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
 
 
11
 
12
  # Set page configuration
13
  st.set_page_config(
@@ -21,33 +23,73 @@ GITHUB_CSV_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngin
21
  GITHUB_KNN_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/knn_model.pkl"
22
  GITHUB_TFIDF_URL = "https://raw.githubusercontent.com/Manithj/bookRecEngine/main/tfidf_vectorizer.pkl"
23
 
 
 
 
 
 
 
 
 
 
 
 
24
  # Define the preprocessing function
25
  def preprocess_text(text):
26
  return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
27
 
28
- # Load models from GitHub - using st.cache_resource to load only once
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  @st.cache_resource
30
- def load_models_from_github():
31
  try:
32
- # Load TF-IDF vectorizer
33
- tfidf_response = requests.get(GITHUB_TFIDF_URL)
34
- tfidf = pickle.loads(tfidf_response.content)
 
 
 
35
 
36
- # Load KNN model
37
- knn_response = requests.get(GITHUB_KNN_URL)
38
- knn_model = pickle.loads(knn_response.content)
 
 
 
39
 
40
  return tfidf, knn_model
41
  except Exception as e:
42
  st.error(f"Error loading models: {e}")
43
  return None, None
44
 
45
- # Load the dataset from GitHub - using st.cache_data to load only once
46
  @st.cache_data
47
- def load_data_from_github():
48
  try:
49
- # Load CSV directly using Polars
50
- df_cleaned = pl.read_csv(GITHUB_CSV_URL)
 
 
 
 
 
 
51
 
52
  # Clean and prepare the data
53
  df_cleaned = df_cleaned.drop_nulls(subset=['name', 'summary', 'genres'])
@@ -69,8 +111,8 @@ def load_data_from_github():
69
 
70
  # Load models and data at startup - this happens only once due to caching
71
  with st.spinner("Loading models and data (this will only happen once)..."):
72
- tfidf, knn_model = load_models_from_github()
73
- df_cleaned = load_data_from_github()
74
 
75
  if tfidf is not None and knn_model is not None and df_cleaned is not None:
76
  models_loaded = True
@@ -82,7 +124,7 @@ st.title("📚 Book Recommendation System")
82
  st.markdown("Enter a book summary and genres to get personalized book recommendations!")
83
 
84
  if not models_loaded:
85
- st.error("Failed to load models or data. Please check the GitHub URLs.")
86
  else:
87
  st.success("Models and data loaded successfully!")
88
 
@@ -184,7 +226,7 @@ st.sidebar.info(
184
  The recommendations are based on textual similarity between your input and
185
  our database of books from Goodreads.
186
 
187
- Models and data are loaded directly from GitHub.
188
  """
189
  )
190
 
 
8
  from sklearn.neighbors import NearestNeighbors
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
+ import os
12
+ import time
13
 
14
  # Set page configuration
15
  st.set_page_config(
 
23
  GITHUB_KNN_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/knn_model.pkl"
24
  GITHUB_TFIDF_URL = "https://raw.githubusercontent.com/Manithj/bookRecEngine/main/tfidf_vectorizer.pkl"
25
 
26
# Local cache layout: models and dataset are stored beside the app so the
# GitHub downloads only need to happen once per container.
MODEL_DIR = "models"
DATA_DIR = "data"
KNN_PATH = os.path.join(MODEL_DIR, "knn_model.pkl")
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
CSV_PATH = os.path.join(DATA_DIR, "goodreadsV2.csv")

# Make sure both cache directories exist before any download runs.
for _cache_dir in (MODEL_DIR, DATA_DIR):
    os.makedirs(_cache_dir, exist_ok=True)
36
+
37
# Define the preprocessing function
def preprocess_text(text):
    """Lowercase *text* and strip every character that is not an ASCII
    letter, an ASCII digit, or whitespace."""
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
40
 
41
# Download and save files if they don't exist locally
def download_and_save_file(url, save_path, is_binary=True):
    """Ensure the file at *save_path* exists, downloading it from *url* if needed.

    Parameters
    ----------
    url : str
        Remote location to fetch.
    save_path : str
        Local path where the file is cached.
    is_binary : bool
        When True, raw response bytes are written ("wb"); when False, the
        decoded response text is written ("w").

    Returns
    -------
    bool
        True when the file is available locally (already cached or freshly
        downloaded), False when the download failed.
    """
    if os.path.exists(save_path):
        return True

    with st.spinner(f"Downloading {os.path.basename(save_path)}..."):
        # A timeout keeps the app from hanging forever on a stalled request.
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            st.error(f"Failed to download from {url}, status code: {response.status_code}")
            return False

        # BUG FIX: text mode ("w") previously wrote response.content (bytes),
        # which raises TypeError; write the decoded text in that case instead.
        if is_binary:
            with open(save_path, "wb") as f:
                f.write(response.content)
        else:
            with open(save_path, "w") as f:
                f.write(response.text)
        # The 'with' block flushes and closes the file on exit, so the old
        # time.sleep(1) "wait for the write" workaround is unnecessary.
        st.success(f"Downloaded {os.path.basename(save_path)}")
    return True
57
+
58
# Load models from local storage or download if needed
@st.cache_resource
def load_models():
    """Return ``(tfidf, knn_model)``, downloading the pickled artifacts on
    the first run and reading them from local disk afterwards.

    Returns ``(None, None)`` when a download or an unpickling step fails;
    the error is surfaced to the UI via ``st.error``.
    """
    try:
        # Attempt both downloads so a failure of one still reports the other.
        have_tfidf = download_and_save_file(GITHUB_TFIDF_URL, TFIDF_PATH)
        have_knn = download_and_save_file(GITHUB_KNN_URL, KNN_PATH)
        if not have_tfidf or not have_knn:
            return None, None

        # NOTE(review): unpickling downloaded artifacts executes arbitrary
        # code if the source is ever compromised — the URLs must stay trusted.
        with open(TFIDF_PATH, 'rb') as handle:
            tfidf = pickle.load(handle)
        with open(KNN_PATH, 'rb') as handle:
            knn_model = pickle.load(handle)

        return tfidf, knn_model
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None
80
 
81
+ # Load the dataset from local storage or download if needed
82
  @st.cache_data
83
+ def load_data():
84
  try:
85
+ # Download dataset if it doesn't exist locally
86
+ csv_downloaded = download_and_save_file(GITHUB_CSV_URL, CSV_PATH, is_binary=True)
87
+
88
+ if not csv_downloaded:
89
+ return None
90
+
91
+ # Load CSV from local storage
92
+ df_cleaned = pl.read_csv(CSV_PATH)
93
 
94
  # Clean and prepare the data
95
  df_cleaned = df_cleaned.drop_nulls(subset=['name', 'summary', 'genres'])
 
111
 
112
  # Load models and data at startup - this happens only once due to caching
113
  with st.spinner("Loading models and data (this will only happen once)..."):
114
+ tfidf, knn_model = load_models()
115
+ df_cleaned = load_data()
116
 
117
  if tfidf is not None and knn_model is not None and df_cleaned is not None:
118
  models_loaded = True
 
124
  st.markdown("Enter a book summary and genres to get personalized book recommendations!")
125
 
126
  if not models_loaded:
127
+ st.error("Failed to load models or data. Please check the file paths and URLs.")
128
  else:
129
  st.success("Models and data loaded successfully!")
130
 
 
226
  The recommendations are based on textual similarity between your input and
227
  our database of books from Goodreads.
228
 
229
+ Models and data are stored locally on the server after initial download.
230
  """
231
  )
232