File size: 3,579 Bytes
dc56243
 
 
 
 
 
5d9493d
 
dc56243
5d9493d
 
dc56243
5d9493d
dc56243
5d9493d
dc56243
 
178a171
 
ad91929
 
dc56243
ad91929
5d9493d
 
ad91929
dc56243
5d9493d
 
dc56243
5d9493d
dc56243
bdd7b82
5d9493d
 
bdd7b82
 
 
 
 
 
 
5d9493d
 
 
 
 
 
 
 
 
dc56243
5d9493d
 
 
dc56243
5d9493d
 
 
 
dc56243
 
ad91929
bdd7b82
5d9493d
 
 
 
 
ad91929
5d9493d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc56243
 
5d9493d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import requests
from datasets import load_dataset

# Configure the Streamlit page: browser-tab title and full-width layout.
st.set_page_config(page_title="Repository Recommender", layout="wide")

# Load model and tokenizer
@st.cache_resource
def load_model():
    """Load the CodeT5 tokenizer and encoder model, cached for the session.

    Returns:
        (tokenizer, model, device) where device is "cuda" if available,
        otherwise "cpu", and the model has been moved onto that device.
    """
    checkpoint = "Salesforce/codet5-small"
    # Prefer GPU when one is present; fall back to CPU.
    run_device = "cuda" if torch.cuda.is_available() else "cpu"
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    loaded_model = AutoModel.from_pretrained(checkpoint).to(run_device)
    return loaded_tokenizer, loaded_model, run_device

def generate_embedding(text, tokenizer, model, device):
    """Generate embeddings for a given text."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Load dataset
@st.cache_data
def load_data(_tokenizer, _model, _device):
    """Load the repository dataset and precompute one embedding per row.

    Parameters are underscore-prefixed so that Streamlit's ``st.cache_data``
    skips hashing them: tokenizer/model objects are unhashable and would
    otherwise raise ``UnhashableParamError`` on the first call.

    Returns:
        DataFrame of the first 500 rows of the dataset with an added
        'embedding' column (one numpy vector per row).
    """
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    df = pd.DataFrame(dataset).head(500)  # Limit to 500 repositories

    # Generate embeddings for each row
    def compute_embedding(row):
        # Concatenate docstring + summary when both fields exist; otherwise
        # embed the empty string so every row still gets a vector.
        text = f"{row['docstring']} {row['summary']}" if 'docstring' in row and 'summary' in row else ""
        return generate_embedding(text, _tokenizer, _model, _device)

    df['embedding'] = df.apply(compute_embedding, axis=1)
    return df

def fetch_readme(repo_url):
    """Fetch a repository's README as raw markdown text.

    The HTML ``blob`` page on github.com returns a full web page rather than
    the README contents, so this requests the raw file from
    ``raw.githubusercontent.com`` instead, trying the two common default
    branches in order.

    Returns:
        The README text on success, "README not available." when no branch
        has one, or an error message string on request failure.
    """
    try:
        base = repo_url.rstrip("/")
        # github.com/<owner>/<repo> -> raw.githubusercontent.com/<owner>/<repo>
        raw_base = base.replace("github.com", "raw.githubusercontent.com", 1)
        for branch in ("main", "master"):
            # Timeout keeps a slow/unreachable host from hanging the app.
            response = requests.get(f"{raw_base}/{branch}/README.md", timeout=10)
            if response.status_code == 200:
                return response.text
        return "README not available."
    except Exception as e:
        return f"Error fetching README: {e}"

# Main application logic
def main():
    """Render the recommender UI: query input, top-5 matches, READMEs."""
    st.title("Repository Recommender System")
    st.write("Find Python repositories to learn production-level coding practices.")

    # Load resources (model is cached across reruns by Streamlit).
    tokenizer, model, device = load_model()
    data = load_data(tokenizer, model, device)

    # Input user query
    user_query = st.text_input("Describe your project or learning goal:",
                               "I am working on a project to recommend music using pandas and numpy.")
    if user_query:
        query_embedding = generate_embedding(user_query, tokenizer, model, device)

        # Compute similarity in one vectorized call: stack all row embeddings
        # into an (n_rows, dim) matrix instead of calling sklearn per row.
        embedding_matrix = np.vstack(data['embedding'].to_numpy())
        data['similarity'] = cosine_similarity([query_embedding], embedding_matrix)[0]

        # Filter and sort recommendations
        top_recommendations = (
            data.sort_values(by='similarity', ascending=False)
            .head(5)
        )

        # Display recommendations
        st.subheader("Top Recommendations")
        for idx, row in top_recommendations.iterrows():
            st.markdown(f"### {row['repo']}")
            st.write(f"**Path:** {row['path']}")
            st.write(f"**Summary:** {row['summary']}")
            st.write(f"**Similarity Score:** {row['similarity']:.2f}")
            st.markdown(f"[Repository Link]({row['url']})")

            # Fetch and display README
            st.subheader("Repository README")
            readme_content = fetch_readme(row['url'])
            st.code(readme_content)

# Run the app only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()