Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""app.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1deINvEblsMkv9h0gJzuGB4uSamW0DMX5
|
8 |
+
"""
|
9 |
+
|
10 |
+
# Dependencies must be installed before running this script; the notebook's
# `pip install` cell is a shell command, not valid Python, so it is kept here
# only as a comment (scikit-learn is listed because `sklearn` is imported):
#   pip install streamlit transformers gdown torch pandas numpy scikit-learn
|
11 |
+
|
12 |
+
import warnings
|
13 |
+
warnings.filterwarnings('ignore')
|
14 |
+
|
15 |
+
import streamlit as st
|
16 |
+
import pandas as pd
|
17 |
+
import numpy as np
|
18 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
19 |
+
from transformers import AutoTokenizer, AutoModel
|
20 |
+
import torch
|
21 |
+
import gdown
|
22 |
+
from pathlib import Path
|
23 |
+
from datetime import datetime
|
24 |
+
import json
|
25 |
+
import torch.cuda
|
26 |
+
|
27 |
+
# Select the compute device: CUDA when torch reports it, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed the per-session containers on the first run of the script
if 'history' not in st.session_state:
    st.session_state['history'] = []
if 'feedback' not in st.session_state:
    st.session_state['feedback'] = {}
|
35 |
+
|
36 |
+
# Step 1: Optimized Model Loading
|
37 |
+
@st.cache_resource
def load_model_and_tokenizer():
    """
    Load the CodeT5-small tokenizer and model (cached via st.cache_resource).

    The weights are requested as float16 when CUDA is available and as
    float32 otherwise; the model is then moved to the module-level
    ``device`` and put into evaluation mode.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "Salesforce/codet5-small"
    weight_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    network = AutoModel.from_pretrained(
        checkpoint,
        torch_dtype=weight_dtype,
        low_cpu_mem_usage=True,
    )
    network = network.to(device)
    network.eval()  # inference only

    return tokenizer, network
|
61 |
+
|
62 |
+
# Step 2: Optimized Dataset Loading
|
63 |
+
@st.cache_resource
def load_data():
    """
    Load the repository dataset and add a combined ``text`` column.

    The Google Drive mount path is used when that file exists. Otherwise
    the shared Drive folder is downloaded into the local ``data`` directory
    and the parquet file is read from there.

    Returns:
        pandas.DataFrame with an extra ``text`` column holding the
        docstring and summary joined by a space (missing values become '').

    Raises:
        FileNotFoundError: if the parquet file is still missing after the
            download attempt.
    """
    drive_copy = Path("/content/drive/MyDrive/practice_ml/filtered_dataset.parquet")
    local_dir = Path("data")
    local_dir.mkdir(exist_ok=True)

    if drive_copy.exists():
        dataset_path = drive_copy
    else:
        dataset_path = local_dir / drive_copy.name
        if not dataset_path.exists():
            with st.spinner('Downloading dataset... This might take a few minutes...'):
                url = "https://drive.google.com/drive/folders/1dphd3vDKV46GwWKW5uo-MBl0GWGyCWUs?usp=drive_link"
                # The link is a Drive *folder*, which gdown.download cannot
                # fetch as a single file; download the folder contents instead.
                gdown.download_folder(url=url, output=str(local_dir), quiet=False)
            if not dataset_path.exists():
                # The file may sit in a sub-folder of the shared folder.
                nested = next(local_dir.rglob(drive_copy.name), None)
                if nested is None:
                    raise FileNotFoundError(
                        f"{drive_copy.name} was not found after downloading {url}"
                    )
                dataset_path = nested

    data = pd.read_parquet(dataset_path)
    data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
    return data
|
79 |
+
|
80 |
+
# Step 3: Optimized Embedding Generation
|
81 |
+
@st.cache_data
def generate_embedding(_model, tokenizer, text):
    """
    Embed ``text`` as the mean of the encoder's last hidden states.

    The text is tokenized (padded, truncated to 512 tokens), moved to the
    module-level ``device`` and passed through ``_model.encoder`` without
    gradient tracking. The hidden states are averaged over dimension 1,
    squeezed, and returned as a NumPy array on the CPU.

    NOTE(review): ``tokenizer`` has no leading underscore, so st.cache_data
    will presumably hash it on every call - confirm this works (and is
    cheap) for the tokenizer class in use.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        hidden_states = _model.encoder(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.cpu().numpy()
|
97 |
+
|
98 |
+
def generate_case_study(repo_data, max_length=150):
    """
    Generate a concise case study brief from repository data.

    Args:
        repo_data: Mapping-like row (dict or pandas Series) providing the
            'summary', 'path', 'docstring' and 'text' fields.
        max_length: Number of characters of the brief to keep before the
            trailing "..." is appended. Pass ``None`` to get the full,
            untruncated brief.

    Returns:
        Markdown text for the brief.
    """
    def _as_text(value):
        # Non-string values (e.g. None or NaN for a missing field) are
        # treated as empty so slicing, len() and lower() cannot fail.
        return value if isinstance(value, str) else ''

    summary = _as_text(repo_data['summary'])
    docstring = _as_text(repo_data['docstring'])
    path = _as_text(repo_data['path'])
    text = _as_text(repo_data['text']).lower()

    file_name = path.split('/')[-1]
    # Join the first five words; interpolating the list itself would render
    # its Python repr (brackets and quotes) in the brief.
    focus = ' '.join(summary.split()[:5])
    complexity = 'Medium' if len(docstring) > 500 else 'Low'
    integration = 'High' if 'api' in text or 'interface' in text else 'Medium'

    brief = (
        f"\n**Project Overview**: {summary[:50]}...\n"
        "\n"
        "**Key Features**:\n"
        f"- Repository contains production-ready {file_name} implementation\n"
        f"- {docstring[:50]}...\n"
        "\n"
        f"**Potential Applications**: This repository can be utilized for projects requiring {focus}...\n"
        "\n"
        f"**Implementation Complexity**: {complexity}\n"
        "\n"
        f"**Integration Potential**: {integration}\n"
    )

    if max_length is None:
        return brief
    return brief[:max_length] + "..."
|
116 |
+
|
117 |
+
def save_feedback(repo_id, feedback_type):
    """
    Increment one feedback counter for a repository in session state.

    Args:
        repo_id: Key identifying the repository in
            ``st.session_state.feedback``.
        feedback_type: Counter to bump, 'likes' or 'dislikes'.
    """
    tallies = st.session_state.feedback
    if repo_id not in tallies:
        # First feedback for this repository: start both counters at zero
        tallies[repo_id] = {'likes': 0, 'dislikes': 0}
    counts = tallies[repo_id]
    counts[feedback_type] = counts[feedback_type] + 1
|
124 |
+
|
125 |
+
# Main App
st.title("Enhanced Repository Recommender System π")

# Sidebar for History and Stats
with st.sidebar:
    st.header("π Search History")
    history = st.session_state.history
    if history:
        recent = history[-5:]  # Show last 5 searches
        # Number each entry by its position in the full history, so the
        # oldest shown search gets the lowest number and the newest the
        # highest.
        first_number = len(history) - len(recent) + 1
        for idx, item in enumerate(recent):
            with st.expander(f"Search {first_number + idx}: {item['query'][:30]}..."):
                st.write(f"Time: {item['timestamp']}")
                st.write(f"Results: {len(item['results'])} repositories")
                if st.button("Rerun this search", key=f"rerun_{idx}"):
                    # Stored in session state under 'rerun_query'
                    st.session_state.rerun_query = item['query']
    else:
        st.write("No search history yet")

    st.header("π Usage Statistics")
    st.write(f"Total Searches: {len(history)}")
    if st.session_state.feedback:
        total_likes = sum(f['likes'] for f in st.session_state.feedback.values())
        total_dislikes = sum(f['dislikes'] for f in st.session_state.feedback.values())
        st.write(f"Total Likes: {total_likes}")
        st.write(f"Total Dislikes: {total_dislikes}")
|
148 |
+
|
149 |
+
# Load resources
|
150 |
+
@st.cache_resource
def initialize_resources():
    """
    Load the dataset, then the tokenizer and model, behind one cached call.

    Returns:
        A ``(data, tokenizer, model)`` tuple.
    """
    frame = load_data()
    tok, net = load_model_and_tokenizer()
    return frame, tok, net


# Module-level handles used by the rest of the script
data, tokenizer, model = initialize_resources()
|
157 |
+
|
158 |
+
# Main interface
user_query = st.text_area(
    "Describe your project:",
    height=150,
    placeholder="Example: I need a machine learning project for customer churn prediction..."
)

# Search button and filters
col1, col2 = st.columns([2, 1])
with col1:
    search_button = st.button("π Search Repositories", type="primary")
with col2:
    top_n = st.selectbox("Number of results:", [3, 5, 10], index=1)

# The sidebar's "Rerun this search" button leaves its query in session state
# under 'rerun_query'; consume it here so the click actually runs a search.
rerun_query = None
if 'rerun_query' in st.session_state:
    rerun_query = st.session_state['rerun_query']
    del st.session_state['rerun_query']

active_query = rerun_query or (user_query if search_button else None)

if active_query:
    with st.spinner("Finding relevant repositories..."):
        # Generate query embedding and score every repository in one call
        query_embedding = generate_embedding(model, tokenizer, active_query)
        # assumes every 'embedding' entry is a vector of the same length as
        # the query embedding - TODO confirm against the dataset
        embedding_matrix = np.vstack(data['embedding'].tolist())
        scores = cosine_similarity(
            np.asarray(query_embedding).reshape(1, -1), embedding_matrix
        )[0]

        # Stable sort keeps ties in dataset order. Only a copy of the top
        # rows receives the score column, so the cached `data` frame is not
        # modified by a search.
        top_positions = np.argsort(-scores, kind='stable')[:top_n]
        recommendations = data.iloc[top_positions].assign(
            similarity=scores[top_positions]
        )

    # Save to history
    st.session_state.history.append({
        'query': active_query,
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'results': recommendations['repo'].tolist()
    })
    # Keep the results so they are still rendered on the rerun that a
    # feedback-button click triggers (search_button is False on that rerun).
    st.session_state['last_recommendations'] = recommendations

if 'last_recommendations' in st.session_state:
    recommendations = st.session_state['last_recommendations']

    # Display recommendations
    st.markdown("### 🎯 Top Recommendations")
    for rank, (_, row) in enumerate(recommendations.iterrows(), start=1):
        with st.expander(f"Repository {rank}: {row['repo']}", expanded=True):
            # Repository details
            detail_col, score_col = st.columns([2, 1])
            with detail_col:
                st.markdown(f"**URL:** [View Repository]({row['url']})")
                st.markdown(f"**Path:** `{row['path']}`")
            with score_col:
                st.metric("Match Score", f"{row['similarity']:.2%}")

            # Feedback buttons: the on_click callback records the vote
            # before the script reruns.
            feedback_col1, feedback_col2 = st.columns(2)
            with feedback_col1:
                if st.button("👍", key=f"like_{rank}",
                             on_click=save_feedback, args=(row['repo'], 'likes')):
                    st.success("Thanks for your feedback!")
            with feedback_col2:
                if st.button("👎", key=f"dislike_{rank}",
                             on_click=save_feedback, args=(row['repo'], 'dislikes')):
                    st.success("Thanks for your feedback!")

        # Case study and documentation are siblings of the repository
        # expander rather than children of it, so this does not rely on
        # Streamlit supporting nested expanders.
        with st.expander("π Case Study Brief"):
            st.markdown(generate_case_study(row))

        # Documentation (only when a non-empty string is present)
        docstring = row['docstring']
        if isinstance(docstring, str) and docstring:
            with st.expander("π Documentation"):
                st.markdown(docstring)
|
219 |
+
|
220 |
+
# Footer
st.markdown("---")
# Resolve the status text first: the footer literal was not an f-string, so
# the conditional expression was shown verbatim instead of being evaluated.
gpu_status = '🟢 Enabled' if torch.cuda.is_available() else '🔴 Disabled'
st.markdown(
    f"""
Made with 🤗 using CodeT5 and Streamlit |
GPU Status: {gpu_status} |
Model: CodeT5-Small
"""
)
|