Spaces:
Running
on
Zero
Running
on
Zero
zamalali
committed on
Commit
·
1796763
1
Parent(s):
9494afe
Refine DeepGit Lite description and improve error handling for GitHub API key and document embeddings
Browse files- app.py +1 -4
- src/__pycache__/deepgit_lite.cpython-311.pyc +0 -0
- src/deepgit_lite.py +19 -6
app.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import gradio as gr
|
2 |
-
import os
|
3 |
import time
|
4 |
import threading
|
5 |
import logging
|
@@ -60,7 +59,7 @@ title = """
|
|
60 |
"""
|
61 |
|
62 |
description = """<p align="center">
|
63 |
-
DeepGit Lite is a streamlined version of DeepGit designed
|
64 |
</p>"""
|
65 |
|
66 |
consent_text = """
|
@@ -118,7 +117,6 @@ def parse_result_to_html(raw_result: str) -> str:
|
|
118 |
<th>Title</th>
|
119 |
<th>Link</th>
|
120 |
<th>Semantic Similarity</th>
|
121 |
-
<th>Activity Score</th>
|
122 |
<th>Final Score</th>
|
123 |
</tr>
|
124 |
</thead>
|
@@ -138,7 +136,6 @@ def parse_result_to_html(raw_result: str) -> str:
|
|
138 |
<td>{data.get('Title', '')}</td>
|
139 |
<td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
|
140 |
<td>{format_percent(data.get('Semantic Similarity', ''))}</td>
|
141 |
-
<td>{float(data.get('Activity Score', 0)):.2f}</td>
|
142 |
<td>{format_percent(data.get('Final Score', ''))}</td>
|
143 |
</tr>
|
144 |
"""
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import time
|
3 |
import threading
|
4 |
import logging
|
|
|
59 |
"""
|
60 |
|
61 |
description = """<p align="center">
|
62 |
+
DeepGit Lite is a streamlined version of DeepGit designed for fast semantic search on GitHub repositories. It enhances your query, retrieves repositories using dense retrieval via FAISS, filters by star count, combines scores based on semantic similarity and popularity, and then provides a concise justification for the top results.
|
63 |
</p>"""
|
64 |
|
65 |
consent_text = """
|
|
|
117 |
<th>Title</th>
|
118 |
<th>Link</th>
|
119 |
<th>Semantic Similarity</th>
|
|
|
120 |
<th>Final Score</th>
|
121 |
</tr>
|
122 |
</thead>
|
|
|
136 |
<td>{data.get('Title', '')}</td>
|
137 |
<td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
|
138 |
<td>{format_percent(data.get('Semantic Similarity', ''))}</td>
|
|
|
139 |
<td>{format_percent(data.get('Final Score', ''))}</td>
|
140 |
</tr>
|
141 |
"""
|
src/__pycache__/deepgit_lite.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/deepgit_lite.cpython-311.pyc and b/src/__pycache__/deepgit_lite.cpython-311.pyc differ
|
|
src/deepgit_lite.py
CHANGED
@@ -5,7 +5,6 @@ import numpy as np
|
|
5 |
import datetime
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
import faiss
|
8 |
-
import getpass
|
9 |
import math
|
10 |
import logging
|
11 |
from dotenv import load_dotenv
|
@@ -20,7 +19,7 @@ dotenv_path = Path(__file__).resolve().parent.parent / ".env"
|
|
20 |
load_dotenv(dotenv_path=str(dotenv_path))
|
21 |
|
22 |
if "GITHUB_API_KEY" not in os.environ:
|
23 |
-
|
24 |
|
25 |
# ---------------------------
|
26 |
# Logging Setup
|
@@ -49,7 +48,10 @@ Provide the refined query text."""
|
|
49 |
("human", prompt)
|
50 |
]
|
51 |
result = llm_groq.invoke(messages)
|
52 |
-
|
|
|
|
|
|
|
53 |
|
54 |
def justify_candidate(candidate, query):
|
55 |
prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
|
@@ -64,7 +66,9 @@ Provide a concise justification:"""
|
|
64 |
("human", prompt)
|
65 |
]
|
66 |
result = llm_groq.invoke(messages)
|
67 |
-
|
|
|
|
|
68 |
|
69 |
# ---------------------------
|
70 |
# GitHub API Helper Functions
|
@@ -172,15 +176,24 @@ def run_deepgit_lite(user_query):
|
|
172 |
# Stage 1: Dense Retrieval with FAISS
|
173 |
logger.info("Fetching repositories from GitHub...")
|
174 |
repos = fetch_github_repositories(github_query)
|
|
|
|
|
|
|
|
|
175 |
docs = [repo.get("combined_doc", "") for repo in repos]
|
176 |
logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
|
177 |
sem_model = SentenceTransformer("all-mpnet-base-v2")
|
178 |
doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
180 |
def normalize_embeddings(embeddings):
|
181 |
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
182 |
return embeddings / (norms + 1e-10)
|
183 |
-
|
184 |
doc_embeddings = normalize_embeddings(doc_embeddings)
|
185 |
query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
|
186 |
query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
|
|
|
5 |
import datetime
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
import faiss
|
|
|
8 |
import math
|
9 |
import logging
|
10 |
from dotenv import load_dotenv
|
|
|
19 |
load_dotenv(dotenv_path=str(dotenv_path))
|
20 |
|
21 |
if "GITHUB_API_KEY" not in os.environ:
|
22 |
+
raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
|
23 |
|
24 |
# ---------------------------
|
25 |
# Logging Setup
|
|
|
48 |
("human", prompt)
|
49 |
]
|
50 |
result = llm_groq.invoke(messages)
|
51 |
+
# Extract text content if available
|
52 |
+
if hasattr(result, "content"):
|
53 |
+
return result.content
|
54 |
+
return str(result)
|
55 |
|
56 |
def justify_candidate(candidate, query):
|
57 |
prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
|
|
|
66 |
("human", prompt)
|
67 |
]
|
68 |
result = llm_groq.invoke(messages)
|
69 |
+
if hasattr(result, "content"):
|
70 |
+
return result.content
|
71 |
+
return str(result)
|
72 |
|
73 |
# ---------------------------
|
74 |
# GitHub API Helper Functions
|
|
|
176 |
# Stage 1: Dense Retrieval with FAISS
|
177 |
logger.info("Fetching repositories from GitHub...")
|
178 |
repos = fetch_github_repositories(github_query)
|
179 |
+
if not repos:
|
180 |
+
logger.error("No repositories found. Please refine your query.")
|
181 |
+
return "\nNo repositories found for your query. Please try a different query."
|
182 |
+
|
183 |
docs = [repo.get("combined_doc", "") for repo in repos]
|
184 |
logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
|
185 |
sem_model = SentenceTransformer("all-mpnet-base-v2")
|
186 |
doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
|
187 |
+
|
188 |
+
# Check if embeddings array is empty or 1-dimensional
|
189 |
+
if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
|
190 |
+
logger.error("No document embeddings generated. Aborting dense retrieval.")
|
191 |
+
return "\nFailed to generate document embeddings. Please try again."
|
192 |
+
|
193 |
def normalize_embeddings(embeddings):
|
194 |
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
195 |
return embeddings / (norms + 1e-10)
|
196 |
+
|
197 |
doc_embeddings = normalize_embeddings(doc_embeddings)
|
198 |
query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
|
199 |
query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
|