zamalali committed on
Commit
1796763
·
1 Parent(s): 9494afe

Refine DeepGit Lite description and improve error handling for GitHub API key and document embeddings

Browse files
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import gradio as gr
2
- import os
3
  import time
4
  import threading
5
  import logging
@@ -60,7 +59,7 @@ title = """
60
  """
61
 
62
  description = """<p align="center">
63
- DeepGit Lite is a streamlined version of DeepGit designed to perform advanced semantic research on GitHub repositories with faster response times. It uses query enhancement, dense retrieval via FAISS, activity analysis, and a final multi-factor ranking (combining semantic similarity, activity, and popularity) to deliver the best results.
64
  </p>"""
65
 
66
  consent_text = """
@@ -118,7 +117,6 @@ def parse_result_to_html(raw_result: str) -> str:
118
  <th>Title</th>
119
  <th>Link</th>
120
  <th>Semantic Similarity</th>
121
- <th>Activity Score</th>
122
  <th>Final Score</th>
123
  </tr>
124
  </thead>
@@ -138,7 +136,6 @@ def parse_result_to_html(raw_result: str) -> str:
138
  <td>{data.get('Title', '')}</td>
139
  <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
140
  <td>{format_percent(data.get('Semantic Similarity', ''))}</td>
141
- <td>{float(data.get('Activity Score', 0)):.2f}</td>
142
  <td>{format_percent(data.get('Final Score', ''))}</td>
143
  </tr>
144
  """
 
1
  import gradio as gr
 
2
  import time
3
  import threading
4
  import logging
 
59
  """
60
 
61
  description = """<p align="center">
62
+ DeepGit Lite is a streamlined version of DeepGit designed for fast semantic search on GitHub repositories. It enhances your query, retrieves repositories using dense retrieval via FAISS, filters by star count, combines scores based on semantic similarity and popularity, and then provides a concise justification for the top results.
63
  </p>"""
64
 
65
  consent_text = """
 
117
  <th>Title</th>
118
  <th>Link</th>
119
  <th>Semantic Similarity</th>
 
120
  <th>Final Score</th>
121
  </tr>
122
  </thead>
 
136
  <td>{data.get('Title', '')}</td>
137
  <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
138
  <td>{format_percent(data.get('Semantic Similarity', ''))}</td>
 
139
  <td>{format_percent(data.get('Final Score', ''))}</td>
140
  </tr>
141
  """
src/__pycache__/deepgit_lite.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/deepgit_lite.cpython-311.pyc and b/src/__pycache__/deepgit_lite.cpython-311.pyc differ
 
src/deepgit_lite.py CHANGED
@@ -5,7 +5,6 @@ import numpy as np
5
  import datetime
6
  from sentence_transformers import SentenceTransformer
7
  import faiss
8
- import getpass
9
  import math
10
  import logging
11
  from dotenv import load_dotenv
@@ -20,7 +19,7 @@ dotenv_path = Path(__file__).resolve().parent.parent / ".env"
20
  load_dotenv(dotenv_path=str(dotenv_path))
21
 
22
  if "GITHUB_API_KEY" not in os.environ:
23
- os.environ["GITHUB_API_KEY"] = getpass.getpass("Enter your GitHub API key: ")
24
 
25
  # ---------------------------
26
  # Logging Setup
@@ -49,7 +48,10 @@ Provide the refined query text."""
49
  ("human", prompt)
50
  ]
51
  result = llm_groq.invoke(messages)
52
- return result
 
 
 
53
 
54
  def justify_candidate(candidate, query):
55
  prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
@@ -64,7 +66,9 @@ Provide a concise justification:"""
64
  ("human", prompt)
65
  ]
66
  result = llm_groq.invoke(messages)
67
- return result
 
 
68
 
69
  # ---------------------------
70
  # GitHub API Helper Functions
@@ -172,15 +176,24 @@ def run_deepgit_lite(user_query):
172
  # Stage 1: Dense Retrieval with FAISS
173
  logger.info("Fetching repositories from GitHub...")
174
  repos = fetch_github_repositories(github_query)
 
 
 
 
175
  docs = [repo.get("combined_doc", "") for repo in repos]
176
  logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
177
  sem_model = SentenceTransformer("all-mpnet-base-v2")
178
  doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
179
-
 
 
 
 
 
180
  def normalize_embeddings(embeddings):
181
  norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
182
  return embeddings / (norms + 1e-10)
183
-
184
  doc_embeddings = normalize_embeddings(doc_embeddings)
185
  query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
186
  query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
 
5
  import datetime
6
  from sentence_transformers import SentenceTransformer
7
  import faiss
 
8
  import math
9
  import logging
10
  from dotenv import load_dotenv
 
19
  load_dotenv(dotenv_path=str(dotenv_path))
20
 
21
  if "GITHUB_API_KEY" not in os.environ:
22
+ raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
23
 
24
  # ---------------------------
25
  # Logging Setup
 
48
  ("human", prompt)
49
  ]
50
  result = llm_groq.invoke(messages)
51
+ # Extract text content if available
52
+ if hasattr(result, "content"):
53
+ return result.content
54
+ return str(result)
55
 
56
  def justify_candidate(candidate, query):
57
  prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
 
66
  ("human", prompt)
67
  ]
68
  result = llm_groq.invoke(messages)
69
+ if hasattr(result, "content"):
70
+ return result.content
71
+ return str(result)
72
 
73
  # ---------------------------
74
  # GitHub API Helper Functions
 
176
  # Stage 1: Dense Retrieval with FAISS
177
  logger.info("Fetching repositories from GitHub...")
178
  repos = fetch_github_repositories(github_query)
179
+ if not repos:
180
+ logger.error("No repositories found. Please refine your query.")
181
+ return "\nNo repositories found for your query. Please try a different query."
182
+
183
  docs = [repo.get("combined_doc", "") for repo in repos]
184
  logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
185
  sem_model = SentenceTransformer("all-mpnet-base-v2")
186
  doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
187
+
188
+ # Check if embeddings array is empty or 1-dimensional
189
+ if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
190
+ logger.error("No document embeddings generated. Aborting dense retrieval.")
191
+ return "\nFailed to generate document embeddings. Please try again."
192
+
193
  def normalize_embeddings(embeddings):
194
  norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
195
  return embeddings / (norms + 1e-10)
196
+
197
  doc_embeddings = normalize_embeddings(doc_embeddings)
198
  query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
199
  query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]