Spaces:
Running
on
Zero
Running
on
Zero
zamalali
committed on
Commit
·
1796763
1
Parent(s):
9494afe
Refine DeepGit Lite description and improve error handling for GitHub API key and document embeddings
Browse files- app.py +1 -4
- src/__pycache__/deepgit_lite.cpython-311.pyc +0 -0
- src/deepgit_lite.py +19 -6
app.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import gradio as gr
|
2 |
-
import os
|
3 |
import time
|
4 |
import threading
|
5 |
import logging
|
@@ -60,7 +59,7 @@ title = """
|
|
60 |
"""
|
61 |
|
62 |
description = """<p align="center">
|
63 |
-
DeepGit Lite is a streamlined version of DeepGit designed
|
64 |
</p>"""
|
65 |
|
66 |
consent_text = """
|
@@ -118,7 +117,6 @@ def parse_result_to_html(raw_result: str) -> str:
|
|
118 |
<th>Title</th>
|
119 |
<th>Link</th>
|
120 |
<th>Semantic Similarity</th>
|
121 |
-
<th>Activity Score</th>
|
122 |
<th>Final Score</th>
|
123 |
</tr>
|
124 |
</thead>
|
@@ -138,7 +136,6 @@ def parse_result_to_html(raw_result: str) -> str:
|
|
138 |
<td>{data.get('Title', '')}</td>
|
139 |
<td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
|
140 |
<td>{format_percent(data.get('Semantic Similarity', ''))}</td>
|
141 |
-
<td>{float(data.get('Activity Score', 0)):.2f}</td>
|
142 |
<td>{format_percent(data.get('Final Score', ''))}</td>
|
143 |
</tr>
|
144 |
"""
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import time
|
3 |
import threading
|
4 |
import logging
|
|
|
59 |
"""
|
60 |
|
61 |
description = """<p align="center">
|
62 |
+
DeepGit Lite is a streamlined version of DeepGit designed for fast semantic search on GitHub repositories. It enhances your query, retrieves repositories using dense retrieval via FAISS, filters by star count, combines scores based on semantic similarity and popularity, and then provides a concise justification for the top results.
|
63 |
</p>"""
|
64 |
|
65 |
consent_text = """
|
|
|
117 |
<th>Title</th>
|
118 |
<th>Link</th>
|
119 |
<th>Semantic Similarity</th>
|
|
|
120 |
<th>Final Score</th>
|
121 |
</tr>
|
122 |
</thead>
|
|
|
136 |
<td>{data.get('Title', '')}</td>
|
137 |
<td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
|
138 |
<td>{format_percent(data.get('Semantic Similarity', ''))}</td>
|
|
|
139 |
<td>{format_percent(data.get('Final Score', ''))}</td>
|
140 |
</tr>
|
141 |
"""
|
src/__pycache__/deepgit_lite.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/deepgit_lite.cpython-311.pyc and b/src/__pycache__/deepgit_lite.cpython-311.pyc differ
|
|
src/deepgit_lite.py
CHANGED
@@ -5,7 +5,6 @@ import numpy as np
|
|
5 |
import datetime
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
import faiss
|
8 |
-
import getpass
|
9 |
import math
|
10 |
import logging
|
11 |
from dotenv import load_dotenv
|
@@ -20,7 +19,7 @@ dotenv_path = Path(__file__).resolve().parent.parent / ".env"
|
|
20 |
load_dotenv(dotenv_path=str(dotenv_path))
|
21 |
|
22 |
if "GITHUB_API_KEY" not in os.environ:
|
23 |
-
|
24 |
|
25 |
# ---------------------------
|
26 |
# Logging Setup
|
@@ -49,7 +48,10 @@ Provide the refined query text."""
|
|
49 |
("human", prompt)
|
50 |
]
|
51 |
result = llm_groq.invoke(messages)
|
52 |
-
|
|
|
|
|
|
|
53 |
|
54 |
def justify_candidate(candidate, query):
|
55 |
prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
|
@@ -64,7 +66,9 @@ Provide a concise justification:"""
|
|
64 |
("human", prompt)
|
65 |
]
|
66 |
result = llm_groq.invoke(messages)
|
67 |
-
|
|
|
|
|
68 |
|
69 |
# ---------------------------
|
70 |
# GitHub API Helper Functions
|
@@ -172,15 +176,24 @@ def run_deepgit_lite(user_query):
|
|
172 |
# Stage 1: Dense Retrieval with FAISS
|
173 |
logger.info("Fetching repositories from GitHub...")
|
174 |
repos = fetch_github_repositories(github_query)
|
|
|
|
|
|
|
|
|
175 |
docs = [repo.get("combined_doc", "") for repo in repos]
|
176 |
logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
|
177 |
sem_model = SentenceTransformer("all-mpnet-base-v2")
|
178 |
doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
180 |
def normalize_embeddings(embeddings):
|
181 |
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
182 |
return embeddings / (norms + 1e-10)
|
183 |
-
|
184 |
doc_embeddings = normalize_embeddings(doc_embeddings)
|
185 |
query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
|
186 |
query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
|
|
|
5 |
import datetime
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
import faiss
|
|
|
8 |
import math
|
9 |
import logging
|
10 |
from dotenv import load_dotenv
|
|
|
19 |
load_dotenv(dotenv_path=str(dotenv_path))
|
20 |
|
21 |
if "GITHUB_API_KEY" not in os.environ:
|
22 |
+
raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
|
23 |
|
24 |
# ---------------------------
|
25 |
# Logging Setup
|
|
|
48 |
("human", prompt)
|
49 |
]
|
50 |
result = llm_groq.invoke(messages)
|
51 |
+
# Extract text content if available
|
52 |
+
if hasattr(result, "content"):
|
53 |
+
return result.content
|
54 |
+
return str(result)
|
55 |
|
56 |
def justify_candidate(candidate, query):
|
57 |
prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
|
|
|
66 |
("human", prompt)
|
67 |
]
|
68 |
result = llm_groq.invoke(messages)
|
69 |
+
if hasattr(result, "content"):
|
70 |
+
return result.content
|
71 |
+
return str(result)
|
72 |
|
73 |
# ---------------------------
|
74 |
# GitHub API Helper Functions
|
|
|
176 |
# Stage 1: Dense Retrieval with FAISS
|
177 |
logger.info("Fetching repositories from GitHub...")
|
178 |
repos = fetch_github_repositories(github_query)
|
179 |
+
if not repos:
|
180 |
+
logger.error("No repositories found. Please refine your query.")
|
181 |
+
return "\nNo repositories found for your query. Please try a different query."
|
182 |
+
|
183 |
docs = [repo.get("combined_doc", "") for repo in repos]
|
184 |
logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
|
185 |
sem_model = SentenceTransformer("all-mpnet-base-v2")
|
186 |
doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
|
187 |
+
|
188 |
+
# Check if embeddings array is empty or 1-dimensional
|
189 |
+
if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
|
190 |
+
logger.error("No document embeddings generated. Aborting dense retrieval.")
|
191 |
+
return "\nFailed to generate document embeddings. Please try again."
|
192 |
+
|
193 |
def normalize_embeddings(embeddings):
|
194 |
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
195 |
return embeddings / (norms + 1e-10)
|
196 |
+
|
197 |
doc_embeddings = normalize_embeddings(doc_embeddings)
|
198 |
query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
|
199 |
query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
|