frankjosh committed on
Commit
5d9493d
·
verified ·
1 Parent(s): 9a1517e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -418
app.py CHANGED
@@ -1,439 +1,89 @@
1
- # -*- coding: utf-8 -*-
2
- """repository_recommender.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1qv09N8Vtcw5vr5NqCSfZonFeh1SQmVW5
8
- """
9
-
10
- #!pip install pyarrow pandas numpy streamlit gdown torch transformers
11
-
12
- import warnings
13
- warnings.filterwarnings('ignore')
14
-
15
  import streamlit as st
16
  import pandas as pd
17
  import numpy as np
18
  from sklearn.metrics.pairwise import cosine_similarity
19
  from transformers import AutoTokenizer, AutoModel
20
  import torch
21
- import gdown
22
- from pathlib import Path
23
- from datetime import datetime
24
- import json
25
- from huggingface import hf_hub_download
26
-
27
- # Initialize session state for history and feedback
28
- if 'search_history' not in st.session_state:
29
- st.session_state.search_history = []
30
- if 'feedback_data' not in st.session_state:
31
- st.session_state.feedback_data = {}
32
-
33
- # Model Loading Optimization
34
- class ModelManager:
35
- def __init__(self):
36
- self.model = None
37
- self.tokenizer = None
38
- self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
39
-
40
- @st.cache_resource
41
- def load_model_and_tokenizer(self):
42
- """Optimized model loading with device placement"""
43
- model_name = "Salesforce/codet5-small"
44
- tokenizer = AutoTokenizer.from_pretrained(model_name)
45
- model = AutoModel.from_pretrained(model_name).to(self.device)
46
- model.eval() # Set model to evaluation mode
47
- return tokenizer, model
48
-
49
- def get_model_and_tokenizer(self):
50
- if self.model is None or self.tokenizer is None:
51
- self.tokenizer, self.model = self.load_model_and_tokenizer()
52
- return self.tokenizer, self.model
53
-
54
- @torch.no_grad() # Disable gradient computation
55
- def generate_embedding(self, text, max_length=512):
56
- """Optimized embedding generation"""
57
- tokenizer, model = self.get_model_and_tokenizer()
58
- inputs = tokenizer(
59
- text,
60
- return_tensors="pt",
61
- padding=True,
62
- truncation=True,
63
- max_length=max_length
64
- ).to(self.device)
65
-
66
- outputs = model.encoder(**inputs)
67
- embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
68
- return embedding
69
-
70
- # Data Management
71
- class DataManager:
72
- @st.cache_resource
73
- def load_dataset():
74
- """Load and prepare dataset"""
75
-
76
- Path("data").mkdir(exist_ok=True)
77
- dataset_path = "https://drive.google.com/file/d/1KEJPaCtNB-uOFjcEOOvxhD2bxW-xzXtJ/view?usp=drive_link"
78
-
79
- if not Path(dataset_path).exists():
80
- with st.spinner('Downloading dataset... This might take a few minutes...'):
81
- url = "https://drive.google.com/file/d/1KEJPaCtNB-uOFjcEOOvxhD2bxW-xzXtJ/view?usp=drive_link"
82
- gdown.download(url, dataset_path, quiet=False)
83
-
84
- data = pd.read_csv(dataset_path)
85
- data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
86
- return data
87
-
88
- @st.cache_data
89
- def compute_embeddings(_data, _model_manager):
90
- """Compute embeddings in batches"""
91
- embeddings = []
92
- batch_size = 32
93
-
94
- with st.progress(0) as progress_bar:
95
- for i in range(0, len(_data), batch_size):
96
- batch = _data['text'].iloc[i:i+batch_size]
97
- batch_embeddings = [_model_manager.generate_embedding(text) for text in batch]
98
- embeddings.extend(batch_embeddings)
99
- progress_bar.progress(min((i + batch_size) / len(_data), 1.0))
100
-
101
- return embeddings
102
-
103
- # History and Feedback Management
104
- def add_to_history(query, recommendations):
105
- """Add search to history"""
106
- history_entry = {
107
- 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
108
- 'query': query,
109
- 'recommendations': recommendations[['repo', 'path', 'url', 'similarity']].to_dict('records')
110
- }
111
- st.session_state.search_history.insert(0, history_entry)
112
-
113
- # Keep only last 10 searches
114
- if len(st.session_state.search_history) > 10:
115
- st.session_state.search_history.pop()
116
-
117
- def save_feedback(repo_id, feedback_type):
118
- """Save user feedback"""
119
- if repo_id not in st.session_state.feedback_data:
120
- st.session_state.feedback_data[repo_id] = {'likes': 0, 'dislikes': 0}
121
 
122
- if feedback_type == 'like':
123
- st.session_state.feedback_data[repo_id]['likes'] += 1
124
- else:
125
- st.session_state.feedback_data[repo_id]['dislikes'] += 1
126
 
127
- def get_recommendations(query, data, model_manager, top_n=5):
128
- """Get repository recommendations"""
129
- query_embedding = model_manager.generate_embedding(query)
130
- similarities = data['embedding'].apply(
131
- lambda x: cosine_similarity([query_embedding], [x])[0][0]
132
- )
133
- recommendations = data.assign(similarity=similarities)\
134
- .sort_values(by='similarity', ascending=False)\
135
- .head(top_n)
136
- return recommendations
137
-
138
- # Streamlit UI
139
- def main():
140
- st.title("Repository Recommender System 🚀")
141
-
142
- # Sidebar with history
143
- with st.sidebar:
144
- st.header("Search History 📜")
145
- if st.session_state.search_history:
146
- for entry in st.session_state.search_history:
147
- with st.expander(f"🔍 {entry['timestamp']}", expanded=False):
148
- st.write(f"Query: {entry['query']}")
149
- for rec in entry['recommendations'][:3]: # Show top 3
150
- st.write(f"- {rec['repo']} ({rec['similarity']:.2%})")
151
- else:
152
- st.info("No search history yet")
153
-
154
- # Main interface
155
- st.markdown("""
156
- **Welcome to the Enhanced Repo_Recommender!**
157
-
158
- Enter your project description to get personalized repository recommendations.
159
- New features:
160
- - 📜 Search history (check sidebar)
161
- - 👍 Repository feedback
162
- - ⚡ Optimized performance
163
- """)
164
-
165
- # Initialize managers
166
- model_manager = ModelManager()
167
- data = DataManager.load_dataset()
168
-
169
- # Compute embeddings if not already done
170
- if 'embedding' not in data.columns:
171
- data['embedding'] = DataManager.compute_embeddings(data, model_manager)
172
-
173
- # User input
174
- user_query = st.text_area(
175
- "Describe your project:",
176
- height=150,
177
- placeholder="Example: I need a machine learning project for customer churn prediction..."
178
- )
179
-
180
- # Get recommendations
181
- if st.button("Get Recommendations", type="primary"):
182
- if user_query.strip():
183
- with st.spinner("Finding relevant repositories..."):
184
- recommendations = get_recommendations(user_query, data, model_manager)
185
- add_to_history(user_query, recommendations)
186
-
187
- # Display recommendations
188
- st.markdown("### 🎯 Top Recommendations")
189
- for idx, row in recommendations.iterrows():
190
- with st.expander(f"Repository {idx + 1}: {row['repo']}", expanded=True):
191
- cols = st.columns([2, 1])
192
- with cols[0]:
193
- st.markdown(f"**Path:** `{row['path']}`")
194
- st.markdown(f"**Summary:** {row['summary']}")
195
- st.markdown(f"**URL:** [View Repository]({row['url']})")
196
- with cols[1]:
197
- st.metric("Similarity", f"{row['similarity']:.2%}")
198
-
199
- # Feedback buttons
200
- feedback_cols = st.columns(2)
201
- repo_id = f"{row['repo']}_{row['path']}"
202
-
203
- with feedback_cols[0]:
204
- if st.button("👍", key=f"like_{repo_id}"):
205
- save_feedback(repo_id, 'like')
206
- st.success("Thanks for your feedback!")
207
-
208
- with feedback_cols[1]:
209
- if st.button("👎", key=f"dislike_{repo_id}"):
210
- save_feedback(repo_id, 'dislike')
211
- st.success("Thanks for your feedback!")
212
-
213
- # Show feedback stats
214
- if repo_id in st.session_state.feedback_data:
215
- stats = st.session_state.feedback_data[repo_id]
216
- st.write(f"Likes: {stats['likes']} | Dislikes: {stats['dislikes']}")
217
-
218
- if row['docstring']:
219
- with st.expander("View Documentation"):
220
- st.markdown(row['docstring'])
221
- else:
222
- st.warning("Please enter a project description.")
223
-
224
- # Footer
225
- st.markdown("---")
226
- st.markdown("Made with 🤖 using CodeT5 and Streamlit")
227
-
228
- if __name__ == "__main__":
229
- main()
230
-
231
- import warnings
232
- warnings.filterwarnings('ignore')
233
-
234
- import streamlit as st
235
- import pandas as pd
236
- import numpy as np
237
- from sklearn.metrics.pairwise import cosine_similarity
238
- from transformers import AutoTokenizer, AutoModel
239
- import torch
240
- import gdown
241
- from pathlib import Path
242
- from datetime import datetime
243
-
244
- # Initialize session state
245
- if 'search_history' not in st.session_state:
246
- st.session_state.search_history = []
247
- if 'feedback_data' not in st.session_state:
248
- st.session_state.feedback_data = {}
249
-
250
- # Model Loading Optimization
251
  @st.cache_resource
252
- def load_model_and_tokenizer():
253
- """Optimized model loading with device placement"""
254
  model_name = "Salesforce/codet5-small"
255
  tokenizer = AutoTokenizer.from_pretrained(model_name)
256
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
257
- model = AutoModel.from_pretrained(model_name).to(device)
258
- model.eval() # Set model to evaluation mode
259
- return tokenizer, model, device
260
 
261
- @st.cache_resource
262
- def load_dataset():
263
- """Load and prepare dataset"""
264
- Path("data").mkdir(exist_ok=True)
265
- dataset_path = "/content/drive/MyDrive/practice_ml/filtered_dataset.csv"
266
-
267
- if not Path(dataset_path).exists():
268
- with st.spinner('Downloading dataset... This might take a few minutes...'):
269
- url = "https://drive.google.com/file/d/1pPYlUEtIA3bi8iLVKqzF-37sHoaOhTZz/view?usp=sharing"
270
- gdown.download(url, dataset_path, quiet=False)
271
-
272
- data = pd.read_csv(dataset_path)
273
- data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
274
- return data
275
-
276
- @st.cache_data
277
- def generate_embedding(_tokenizer, _model, _device, text, max_length=512):
278
- """Generate embedding for a single text"""
279
  with torch.no_grad():
280
- inputs = _tokenizer(
281
- text,
282
- return_tensors="pt",
283
- padding=True,
284
- truncation=True,
285
- max_length=max_length
286
- ).to(_device)
287
-
288
- outputs = _model.encoder(**inputs)
289
- embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
290
- return embedding
291
 
 
292
  @st.cache_data
293
- def compute_embeddings(_data, _tokenizer, _model, _device):
294
- """Compute embeddings in batches"""
295
- embeddings = []
296
- batch_size = 32
297
- texts = _data['text'].tolist()
298
-
299
- with st.progress(0) as progress_bar:
300
- progress_container = st.empty()
301
- for i in range(0, len(texts), batch_size):
302
- batch = texts[i:i+batch_size]
303
- batch_embeddings = [
304
- generate_embedding(_tokenizer, _model, _device, text)
305
- for text in batch
306
- ]
307
- embeddings.extend(batch_embeddings)
308
- progress_container.progress(min((i + batch_size) / len(texts), 1.0))
309
-
310
- return embeddings
311
-
312
- def add_to_history(query, recommendations):
313
- """Add search to history"""
314
- history_entry = {
315
- 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
316
- 'query': query,
317
- 'recommendations': recommendations[['repo', 'path', 'url', 'similarity']].to_dict('records')
318
- }
319
- st.session_state.search_history.insert(0, history_entry)
320
- if len(st.session_state.search_history) > 10:
321
- st.session_state.search_history.pop()
322
-
323
- def save_feedback(repo_id, feedback_type):
324
- """Save user feedback"""
325
- if repo_id not in st.session_state.feedback_data:
326
- st.session_state.feedback_data[repo_id] = {'likes': 0, 'dislikes': 0}
327
-
328
- if feedback_type == 'like':
329
- st.session_state.feedback_data[repo_id]['likes'] += 1
330
- else:
331
- st.session_state.feedback_data[repo_id]['dislikes'] += 1
332
-
333
- def get_recommendations(query, data, tokenizer, model, device, top_n=5):
334
- """Get repository recommendations"""
335
- query_embedding = generate_embedding(tokenizer, model, device, query)
336
-
337
- similarities = []
338
- for emb in data['embedding']:
339
- sim = cosine_similarity([query_embedding], [emb])[0][0]
340
- similarities.append(sim)
341
-
342
- recommendations = data.assign(similarity=similarities)\
343
- .sort_values(by='similarity', ascending=False)\
344
- .head(top_n)
345
- return recommendations
346
-
347
- def main():
348
- st.title("Repository Recommender System 🚀")
349
-
350
- # Sidebar with history
351
- with st.sidebar:
352
- st.header("Search History 📜")
353
- if st.session_state.search_history:
354
- for entry in st.session_state.search_history:
355
- with st.expander(f"🔍 {entry['timestamp']}", expanded=False):
356
- st.write(f"Query: {entry['query']}")
357
- for rec in entry['recommendations'][:3]:
358
- st.write(f"- {rec['repo']} ({rec['similarity']:.2%})")
359
  else:
360
- st.info("No search history yet")
361
-
362
- st.markdown("""
363
- **Welcome to the Enhanced Repo_Recommender!**
364
 
365
- Enter your project description to get personalized repository recommendations.
366
- New features:
367
- - 📜 Search history (check sidebar)
368
- - 👍 Repository feedback
369
- - ⚡ Optimized performance
370
- """)
371
 
372
  # Load resources
373
- with st.spinner("Loading model and data..."):
374
- tokenizer, model, device = load_model_and_tokenizer()
375
- data = load_dataset()
376
-
377
- # Compute embeddings if not already done
378
- if 'embedding' not in data.columns:
379
- data['embedding'] = compute_embeddings(data, tokenizer, model, device)
380
-
381
- # User input
382
- user_query = st.text_area(
383
- "Describe your project:",
384
- height=150,
385
- placeholder="Example: I need a machine learning project for customer churn prediction..."
386
- )
387
-
388
- # Get recommendations
389
- if st.button("Get Recommendations", type="primary"):
390
- if user_query.strip():
391
- with st.spinner("Finding relevant repositories..."):
392
- recommendations = get_recommendations(
393
- user_query, data, tokenizer, model, device
394
- )
395
- add_to_history(user_query, recommendations)
396
-
397
- # Display recommendations
398
- st.markdown("### 🎯 Top Recommendations")
399
- for idx, row in recommendations.iterrows():
400
- with st.expander(f"Repository {idx + 1}: {row['repo']}", expanded=True):
401
- cols = st.columns([2, 1])
402
- with cols[0]:
403
- st.markdown(f"**Path:** `{row['path']}`")
404
- st.markdown(f"**Summary:** {row['summary']}")
405
- st.markdown(f"**URL:** [View Repository]({row['url']})")
406
- with cols[1]:
407
- st.metric("Similarity", f"{row['similarity']:.2%}")
408
-
409
- # Feedback buttons
410
- feedback_cols = st.columns(2)
411
- repo_id = f"{row['repo']}_{row['path']}"
412
-
413
- with feedback_cols[0]:
414
- if st.button("👍", key=f"like_{repo_id}"):
415
- save_feedback(repo_id, 'like')
416
- st.success("Thanks for your feedback!")
417
-
418
- with feedback_cols[1]:
419
- if st.button("👎", key=f"dislike_{repo_id}"):
420
- save_feedback(repo_id, 'dislike')
421
- st.success("Thanks for your feedback!")
422
-
423
- # Show feedback stats
424
- if repo_id in st.session_state.feedback_data:
425
- stats = st.session_state.feedback_data[repo_id]
426
- st.write(f"Likes: {stats['likes']} | Dislikes: {stats['dislikes']}")
427
-
428
- if row['docstring']:
429
- with st.expander("View Documentation"):
430
- st.markdown(row['docstring'])
431
- else:
432
- st.warning("Please enter a project description.")
433
-
434
- # Footer
435
- st.markdown("---")
436
- st.markdown("Made with 🤖 using CodeT5 and Streamlit")
437
 
438
  if __name__ == "__main__":
439
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  from transformers import AutoTokenizer, AutoModel
6
  import torch
7
+ import requests
8
+ from datasets import load_dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ # Set page configuration
11
+ st.set_page_config(page_title="Repository Recommender", layout="wide")
 
 
12
 
13
+ # Load model and tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  @st.cache_resource
15
def load_model():
    """Load the CodeT5-small tokenizer and model used to embed text.

    The model is placed on the GPU when CUDA is available and on the CPU
    otherwise, so the app also starts on CPU-only hosts instead of failing
    inside ``.to("cuda")``.

    Returns:
        tuple: ``(tokenizer, model)`` for ``Salesforce/codet5-small``.
    """
    model_name = "Salesforce/codet5-small"
    # Pick the device at runtime rather than assuming a GPU is present.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    return tokenizer, model
 
 
20
 
21
def generate_embedding(text, tokenizer, model):
    """Generate a mean-pooled encoder embedding for a given text.

    Args:
        text: Text to embed; it is truncated to at most 512 tokens.
        tokenizer: Callable that tokenizes ``text`` and returns a mapping of
            tensors (called with ``return_tensors="pt"``).
        model: Model exposing an ``encoder`` callable and a ``device``
            attribute.

    Returns:
        numpy.ndarray: The encoder's last hidden state averaged over the
        sequence dimension, squeezed and moved to the CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Follow the model's device instead of assuming "cuda", so this also
    # works when the model was loaded on the CPU.
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
 
 
 
 
 
 
 
 
 
28
 
29
+ # Load dataset
30
  @st.cache_data
31
def load_data(limit=500):
    """Load the repository dataset as a pandas DataFrame.

    Args:
        limit: Maximum number of rows to keep from the start of the
            ``train`` split. Defaults to 500, the previously hard-coded cap.

    Returns:
        pandas.DataFrame: The first ``limit`` rows of
        ``frankjosh/filtered_dataset``.
    """
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    # Keep only the leading rows to bound memory use and similarity cost.
    return pd.DataFrame(dataset).head(limit)
35
+
36
def _github_raw_readme_url(repo_url):
    """Return the raw-content README URL for a GitHub URL, or None otherwise."""
    from urllib.parse import urlsplit

    parts = urlsplit(repo_url)
    if parts.netloc.lower() not in ("github.com", "www.github.com"):
        return None
    segments = [segment for segment in parts.path.split("/") if segment]
    if len(segments) < 2:
        return None
    owner, repo = segments[0], segments[1]
    if repo.endswith(".git"):
        repo = repo[:-4]
    # "HEAD" resolves to the default branch, so this does not assume "main".
    return f"https://raw.githubusercontent.com/{owner}/{repo}/HEAD/README.md"


def fetch_readme(repo_url):
    """Fetch README file from GitHub repository.

    For GitHub URLs the raw Markdown of the default branch is requested, so
    the caller receives README text rather than the HTML of the ``/blob/``
    page. URLs on other hosts keep the ``<url>/blob/main/README.md``
    convention.

    Args:
        repo_url: URL of the repository (or of a file inside it).

    Returns:
        str: The README text on HTTP 200, ``"README not available."`` when
        the URL is empty/not a string or the server answers with another
        status, or ``"Error fetching README: ..."`` when the request fails.
    """
    if not isinstance(repo_url, str) or not repo_url.strip():
        # Nothing to request (e.g. a missing value in the dataset).
        return "README not available."

    cleaned = repo_url.strip()
    readme_url = _github_raw_readme_url(cleaned)
    if readme_url is None:
        readme_url = cleaned.rstrip("/") + "/blob/main/README.md"

    try:
        # A timeout keeps one slow host from hanging the whole app.
        response = requests.get(readme_url, timeout=10)
    except requests.RequestException as e:
        return f"Error fetching README: {e}"

    if response.status_code == 200:
        return response.text
    return "README not available."
 
47
 
48
+ # Main application logic
49
def main():
    """Run the Streamlit app: read a query and show the top-5 repositories.

    The query is embedded with the loaded model, compared against the
    dataset's ``embedding`` column by cosine similarity, and the five most
    similar rows are rendered together with each repository's README.
    """
    st.title("Repository Recommender System")
    st.write("Find Python repositories to learn production-level coding practices.")

    # Load resources
    tokenizer, model = load_model()
    data = load_data()

    # Input user query
    user_query = st.text_input(
        "Describe your project or learning goal:",
        "I am working on a project to recommend music using pandas and numpy.",
    )
    if not user_query.strip():
        # Nothing meaningful to embed yet.
        return

    if data.empty or "embedding" not in data.columns:
        st.error("The dataset has no usable 'embedding' column; cannot compute recommendations.")
        return

    query_embedding = generate_embedding(user_query, tokenizer, model)

    # Compute similarity with a single vectorised call instead of one
    # cosine_similarity call per row.
    embedding_matrix = np.vstack([np.asarray(emb) for emb in data["embedding"]])
    scores = cosine_similarity(np.asarray(query_embedding).reshape(1, -1), embedding_matrix)[0]

    # Filter and sort recommendations
    top_recommendations = (
        data.assign(similarity=scores)
        .sort_values(by="similarity", ascending=False)
        .head(5)
    )

    # Display recommendations
    st.subheader("Top Recommendations")
    for _, row in top_recommendations.iterrows():
        st.markdown(f"### {row['repo']}")
        st.write(f"**Path:** {row['path']}")
        st.write(f"**Summary:** {row['summary']}")
        st.write(f"**Similarity Score:** {row['similarity']:.2f}")
        st.markdown(f"[Repository Link]({row['url']})")

        # Fetch and display README
        st.subheader("Repository README")
        readme_content = fetch_readme(row['url'])
        st.code(readme_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  if __name__ == "__main__":
89
+ main()