frankjosh committed on
Commit
c26ed9b
·
verified ·
1 Parent(s): 2256830

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +228 -0
app.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1deINvEblsMkv9h0gJzuGB4uSamW0DMX5
8
+ """
9
+
10
# NOTE: dependencies must be installed outside this script (e.g. from a
# requirements.txt). A bare `pip install ...` line only works as an
# IPython/Colab automagic; in a plain .py file it is a SyntaxError that stops
# the app before anything runs. Packages this file needs:
#   pip install streamlit transformers gdown torch pandas numpy scikit-learn
# (pandas.read_parquet additionally needs a parquet engine such as pyarrow.)
11
+
12
+ import warnings
13
+ warnings.filterwarnings('ignore')
14
+
15
+ import streamlit as st
16
+ import pandas as pd
17
+ import numpy as np
18
+ from sklearn.metrics.pairwise import cosine_similarity
19
+ from transformers import AutoTokenizer, AutoModel
20
+ import torch
21
+ import gdown
22
+ from pathlib import Path
23
+ from datetime import datetime
24
+ import json
25
+ import torch.cuda
26
+
27
# Pick the compute device: CUDA when PyTorch reports a usable GPU, else CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Seed the per-session containers the first time the script runs for a session:
# `history` collects past searches, `feedback` maps repo ids to vote counters.
if 'history' not in st.session_state:
    st.session_state['history'] = []
if 'feedback' not in st.session_state:
    st.session_state['feedback'] = {}
35
+
36
+ # Step 1: Optimized Model Loading
37
@st.cache_resource
def load_model_and_tokenizer():
    """Load the CodeT5-small tokenizer and model (cached by Streamlit).

    The weights are requested as float16 when CUDA is available and float32
    otherwise; the model is then moved to the module-level ``device`` and
    switched to evaluation mode.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    checkpoint = "Salesforce/codet5-small"

    # Half precision only makes sense on the GPU path.
    weights_dtype = torch.float32
    if torch.cuda.is_available():
        weights_dtype = torch.float16

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    model = AutoModel.from_pretrained(
        checkpoint,
        torch_dtype=weights_dtype,
        low_cpu_mem_usage=True,
    )
    model = model.to(device)
    model.eval()  # inference only: disables dropout and similar train-time layers

    return tokenizer, model
61
+
62
+ # Step 2: Optimized Dataset Loading
63
@st.cache_resource
def load_data():
    """Load the repository dataset, downloading it first if it is missing.

    The parquet file is expected at a fixed Google Drive mount path. When it
    is absent, the shared Drive *folder* is downloaded into the file's parent
    directory. The share link points at a folder, so ``gdown.download_folder``
    is required; ``gdown.download`` only handles single-file links.

    Returns:
        pandas.DataFrame: the dataset with an extra ``text`` column holding
        ``docstring`` and ``summary`` joined by a space (missing values are
        treated as empty strings).

    Raises:
        FileNotFoundError: if the dataset file is still missing after the
            download attempt.
    """
    # Kept from the original script: creates a local "data" directory.
    Path("data").mkdir(exist_ok=True)
    dataset_path = Path("/content/drive/MyDrive/practice_ml/filtered_dataset.parquet")

    if not dataset_path.exists():
        # The destination directory must exist before anything is written there.
        dataset_path.parent.mkdir(parents=True, exist_ok=True)
        with st.spinner('Downloading dataset... This might take a few minutes...'):
            url = "https://drive.google.com/drive/folders/1dphd3vDKV46GwWKW5uo-MBl0GWGyCWUs?usp=drive_link"
            gdown.download_folder(url=url, output=str(dataset_path.parent), quiet=False)

        # NOTE(review): assumes the shared folder contains a file named
        # filtered_dataset.parquet at its top level -- confirm against the Drive folder.
        if not dataset_path.exists():
            raise FileNotFoundError(
                f"Dataset not found at {dataset_path} after downloading {url}"
            )

    data = pd.read_parquet(dataset_path)
    data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
    return data
79
+
80
+ # Step 3: Optimized Embedding Generation
81
@st.cache_data
def generate_embedding(_model, tokenizer, text):
    """Embed ``text`` by mean-pooling the model's encoder output.

    Args:
        _model: model exposing an ``encoder`` sub-module. The leading
            underscore tells ``st.cache_data`` not to hash this argument.
        tokenizer: callable tokenizer producing PyTorch tensors.
        text: the string to embed (inputs are truncated to 512 tokens).

    Returns:
        numpy.ndarray: the last hidden state averaged over dim 1, squeezed
        and copied to the CPU.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        encoder_out = _model.encoder(**encoded)
        pooled = encoder_out.last_hidden_state.mean(dim=1)
        return pooled.squeeze().cpu().numpy()
97
+
98
def _text_or_empty(value):
    """Return ``value`` if it is a string, otherwise an empty string."""
    return value if isinstance(value, str) else ''


def generate_case_study(repo_data, max_chars=150):
    """Build a short Markdown case-study brief for one repository record.

    Args:
        repo_data: mapping-like record (dict or DataFrame row) with the keys
            ``summary``, ``docstring``, ``path`` and ``text``. Non-string
            values (``None``, NaN) are treated as empty text instead of
            raising.
        max_chars: maximum number of template characters to keep before
            ``"..."`` is appended. Pass ``None`` for the untruncated brief.
            The default keeps the original 150-character cut.

    Returns:
        str: the Markdown brief, truncated to ``max_chars`` characters plus a
        trailing ``"..."`` when it is longer than that.
    """
    summary = _text_or_empty(repo_data['summary'])
    docstring = _text_or_empty(repo_data['docstring'])
    path = _text_or_empty(repo_data['path'])
    text = _text_or_empty(repo_data['text']).lower()

    component = path.split('/')[-1]
    # Join the first five words; interpolating the list itself would render
    # its Python repr (brackets and quotes) in the Markdown.
    focus = ' '.join(summary.split()[:5])
    complexity = 'Medium' if len(docstring) > 500 else 'Low'
    integration = 'High' if 'api' in text or 'interface' in text else 'Medium'

    template = "\n".join([
        "",
        f"**Project Overview**: {summary[:50]}...",
        "",
        "**Key Features**:",
        f"- Repository contains production-ready {component} implementation",
        f"- {docstring[:50]}...",
        "",
        f"**Potential Applications**: This repository can be utilized for projects requiring {focus}...",
        "",
        f"**Implementation Complexity**: {complexity}",
        "",
        f"**Integration Potential**: {integration}",
        "",
    ])

    if max_chars is not None and len(template) > max_chars:
        return template[:max_chars] + "..."
    return template
116
+
117
def save_feedback(repo_id, feedback_type):
    """Record one vote for a repository in the session's feedback store.

    Args:
        repo_id: key identifying the repository in
            ``st.session_state.feedback``.
        feedback_type: counter to increment, ``'likes'`` or ``'dislikes'``.
    """
    # First vote for this repository: start both counters at zero.
    counters = st.session_state.feedback.setdefault(
        repo_id, {'likes': 0, 'dislikes': 0}
    )
    counters[feedback_type] += 1
124
+
125
# Main App
st.title("Enhanced Repository Recommender System 🚀")

# Sidebar: recent search history and aggregate usage statistics
with st.sidebar:
    st.header("📊 Search History")
    history = st.session_state.history
    if history:
        total_searches = len(history)
        # Show the last five searches, newest first, so that "Search N" really
        # is the N-th search. (Counting down while iterating oldest-first
        # attached the newest number to the oldest entry.)
        for offset, item in enumerate(reversed(history[-5:])):
            search_number = total_searches - offset
            with st.expander(f"Search {search_number}: {item['query'][:30]}..."):
                st.write(f"Time: {item['timestamp']}")
                st.write(f"Results: {len(item['results'])} repositories")
                # Key on the stable search number so a button keeps its
                # identity when newer searches are appended.
                if st.button("Rerun this search", key=f"rerun_{search_number}"):
                    st.session_state.rerun_query = item['query']
    else:
        st.write("No search history yet")

    st.header("📈 Usage Statistics")
    st.write(f"Total Searches: {len(history)}")
    if st.session_state.feedback:
        votes = list(st.session_state.feedback.values())
        total_likes = sum(v['likes'] for v in votes)
        total_dislikes = sum(v['dislikes'] for v in votes)
        st.write(f"Total Likes: {total_likes}")
        st.write(f"Total Dislikes: {total_dislikes}")
148
+
149
+ # Load resources
150
@st.cache_resource
def initialize_resources():
    """Load everything the app needs in one cached call.

    Returns:
        tuple: ``(data, tokenizer, model)`` -- the dataset from
        ``load_data()`` followed by the pair returned by
        ``load_model_and_tokenizer()``.
    """
    dataset = load_data()
    tokenizer_and_model = load_model_and_tokenizer()
    return (dataset, *tokenizer_and_model)
155
+
156
data, tokenizer, model = initialize_resources()

# Main interface
user_query = st.text_area(
    "Describe your project:",
    height=150,
    placeholder="Example: I need a machine learning project for customer churn prediction..."
)

# Search button and filters
button_col, count_col = st.columns([2, 1])
with button_col:
    search_button = st.button("🔍 Search Repositories", type="primary")
with count_col:
    top_n = st.selectbox("Number of results:", [3, 5, 10], index=1)

# A query queued by the sidebar's "Rerun this search" button takes priority
# over the text area; it is consumed here so it fires only once.
active_query = None
if 'rerun_query' in st.session_state:
    active_query = st.session_state['rerun_query']
    del st.session_state['rerun_query']
if not active_query and search_button:
    active_query = user_query

if active_query and active_query.strip():
    if 'embedding' not in data.columns:
        st.error("The dataset has no 'embedding' column, so similarity search is unavailable.")
    else:
        with st.spinner("Finding relevant repositories..."):
            # One vectorized cosine-similarity call against all stored
            # embeddings instead of one scikit-learn call per row.
            query_embedding = np.asarray(
                generate_embedding(model, tokenizer, active_query), dtype=np.float32
            ).reshape(1, -1)
            repo_embeddings = np.vstack(data['embedding'].to_numpy())
            scores = cosine_similarity(query_embedding, repo_embeddings)[0]

            # Stable descending sort: ties keep dataset order. The shared,
            # cached DataFrame is left untouched; scores go on a copy of the
            # selected rows only.
            top_positions = np.argsort(-scores, kind='stable')[:top_n]
            recommendations = data.iloc[top_positions].copy()
            recommendations['similarity'] = scores[top_positions]

        # Save to history
        st.session_state.history.append({
            'query': active_query,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'results': recommendations['repo'].tolist()
        })
        # Persist the results: clicking any widget reruns the script with
        # search_button == False, and the list (and its feedback buttons)
        # would otherwise disappear before the click could be handled.
        st.session_state['last_results'] = recommendations

# Display recommendations from the most recent search of this session
if 'last_results' in st.session_state and len(st.session_state['last_results']) > 0:
    st.markdown("### 🎯 Top Recommendations")
    ranked_rows = enumerate(st.session_state['last_results'].iterrows(), start=1)
    for rank, (row_label, row) in ranked_rows:
        # `rank` is the 1-based position in the result list; `row_label` is
        # the DataFrame index label, used only to build unique widget keys.
        with st.expander(f"Repository {rank}: {row['repo']}", expanded=True):
            # Repository details
            details_col, score_col = st.columns([2, 1])
            with details_col:
                st.markdown(f"**URL:** [View Repository]({row['url']})")
                st.markdown(f"**Path:** `{row['path']}`")
            with score_col:
                st.metric("Match Score", f"{row['similarity']:.2%}")

            # Feedback buttons: callbacks run at the start of the rerun caused
            # by the click, so the vote is recorded before the page redraws.
            like_col, dislike_col = st.columns(2)
            with like_col:
                if st.button("👍", key=f"like_{row_label}",
                             on_click=save_feedback, args=(row['repo'], 'likes')):
                    st.success("Thanks for your feedback!")
            with dislike_col:
                if st.button("👎", key=f"dislike_{row_label}",
                             on_click=save_feedback, args=(row['repo'], 'dislikes')):
                    st.success("Thanks for your feedback!")

            # Tabs rather than inner expanders: Streamlit has historically
            # rejected an expander nested inside another expander.
            docstring = row['docstring']
            has_docs = isinstance(docstring, str) and bool(docstring.strip())
            tab_labels = ["📑 Case Study Brief"]
            if has_docs:  # a NaN docstring is truthy, so test the type explicitly
                tab_labels.append("📚 Documentation")
            tabs = st.tabs(tab_labels)

            with tabs[0]:
                st.markdown(generate_case_study(row))
            if has_docs:
                with tabs[1]:
                    st.markdown(docstring)
219
+
220
# Footer
st.markdown("---")
# Evaluate the GPU status here: the original literal had no `f` prefix, so the
# `{... if ... else ...}` expression was shown verbatim instead of its result.
gpu_status = '🟢 Enabled' if torch.cuda.is_available() else '🔴 Disabled'
st.markdown(
    "Made with 🤖 using CodeT5 and Streamlit | "
    f"GPU Status: {gpu_status} | "
    "Model: CodeT5-Small"
)