Spaces:
Sleeping
Sleeping
File size: 3,579 Bytes
dc56243 5d9493d dc56243 5d9493d dc56243 5d9493d dc56243 5d9493d dc56243 178a171 ad91929 dc56243 ad91929 5d9493d ad91929 dc56243 5d9493d dc56243 5d9493d dc56243 bdd7b82 5d9493d bdd7b82 5d9493d dc56243 5d9493d dc56243 5d9493d dc56243 ad91929 bdd7b82 5d9493d ad91929 5d9493d dc56243 5d9493d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import requests
from datasets import load_dataset
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(layout="wide", page_title="Repository Recommender")
# Load model and tokenizer
@st.cache_resource
def load_model():
    """Load the CodeT5 tokenizer and model once per session.

    Returns:
        tuple: (tokenizer, model, device) where device is "cuda" when a
        GPU is available, otherwise "cpu". The model is already moved to
        that device.
    """
    checkpoint = "Salesforce/codet5-small"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    # Prefer the GPU when one is present; otherwise run on CPU.
    target = "cuda" if torch.cuda.is_available() else "cpu"
    net = AutoModel.from_pretrained(checkpoint).to(target)
    return tok, net, target
def generate_embedding(text, tokenizer, model, device):
    """Embed *text* as the mean-pooled encoder hidden state.

    Returns a 1-D numpy array (the hidden-state average over tokens),
    always on CPU regardless of *device*.
    """
    encoded = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    # Move every input tensor to the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        encoder_out = model.encoder(**encoded)
    pooled = encoder_out.last_hidden_state.mean(dim=1)
    return pooled.squeeze().cpu().numpy()
# Load dataset
@st.cache_data
def load_data(_tokenizer, _model, _device):
    """Load the repository dataset and precompute one embedding per row.

    Parameters are underscore-prefixed so st.cache_data skips hashing
    them: tokenizer/model objects are unhashable and would raise
    UnhashableParamError with the original names. Callers pass them
    positionally, so the rename is backward compatible.

    Returns:
        pd.DataFrame: first 500 rows of the dataset plus an 'embedding'
        column (1-D numpy array per row).
    """
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    df = pd.DataFrame(dataset).head(500)  # Limit to 500 repositories

    # Generate embeddings for each row
    def compute_embedding(row):
        # Combine docstring + summary when both columns exist; otherwise
        # embed the empty string so every row still gets a vector.
        text = f"{row['docstring']} {row['summary']}" if 'docstring' in row and 'summary' in row else ""
        return generate_embedding(text, _tokenizer, _model, _device)

    df['embedding'] = df.apply(compute_embedding, axis=1)
    return df
def fetch_readme(repo_url):
    """Fetch the README.md of a GitHub repository as raw markdown text.

    The original "/blob/<branch>/README.md" URL returns GitHub's HTML
    viewer page, not the file contents; raw.githubusercontent.com serves
    the plain file. Tries the "main" branch first, then "master", and
    bounds each request with a timeout so the UI cannot hang.

    Returns the README text, or a short explanatory message on failure.
    """
    base = repo_url.rstrip("/")
    # Map https://github.com/<owner>/<repo> to the raw-content host.
    raw_base = base.replace(
        "https://github.com/", "https://raw.githubusercontent.com/", 1
    )
    try:
        for branch in ("main", "master"):
            response = requests.get(f"{raw_base}/{branch}/README.md", timeout=10)
            if response.status_code == 200:
                return response.text
        return "README not available."
    except requests.RequestException as e:
        return f"Error fetching README: {e}"
# Main application logic
def main():
    """Render the Streamlit UI: query input, top-5 matches, READMEs."""
    st.title("Repository Recommender System")
    st.write("Find Python repositories to learn production-level coding practices.")
    # Load resources
    tokenizer, model, device = load_model()
    data = load_data(tokenizer, model, device)
    # Input user query
    user_query = st.text_input("Describe your project or learning goal:",
        "I am working on a project to recommend music using pandas and numpy.")
    if user_query:
        query_embedding = generate_embedding(user_query, tokenizer, model, device)
        # Compute similarity in a single vectorized call instead of one
        # sklearn call per row — same values, far less overhead.
        matrix = np.vstack(data['embedding'].to_numpy())
        data['similarity'] = cosine_similarity([query_embedding], matrix)[0]
        # Filter and sort recommendations
        top_recommendations = (
            data.sort_values(by='similarity', ascending=False)
            .head(5)
        )
        # Display recommendations
        st.subheader("Top Recommendations")
        for _, row in top_recommendations.iterrows():
            st.markdown(f"### {row['repo']}")
            st.write(f"**Path:** {row['path']}")
            st.write(f"**Summary:** {row['summary']}")
            st.write(f"**Similarity Score:** {row['similarity']:.2f}")
            st.markdown(f"[Repository Link]({row['url']})")
            # Fetch and display README
            st.subheader("Repository README")
            readme_content = fetch_readme(row['url'])
            st.code(readme_content)

if __name__ == "__main__":
    main()
|