Spaces: Running on Zero
zamalali committed · Commit 94ed277 · Parent(s): 27298b1

Refactor app.py and main.py for improved readability and functionality; add environment variable loading
Files changed:
- __pycache__/main.cpython-311.pyc +0 -0
- app.py +4 -8
- main.py +33 -20
__pycache__/main.cpython-311.pyc
ADDED
Binary file (21.6 kB).
app.py
CHANGED
@@ -1,12 +1,11 @@
 import spaces
-
-
 import gradio as gr
 import time
-from gradio.themes.utils import sizes
 import threading
 import logging
-from main import run_repository_ranking
+from gradio.themes.utils import sizes
+from main import run_repository_ranking  # Import the repository ranking function
+
 # ---------------------------
 # Global Logging Buffer Setup
 # ---------------------------
@@ -45,8 +44,7 @@ def parse_result_to_html(raw_result: str) -> str:
     Only the top 10 results are displayed.
     """
     entries = raw_result.strip().split("Final Rank:")
-    #
-    entries = entries[1:11]
+    entries = entries[1:11]  # Use only the first 10 entries
     if not entries:
         return "<p>No repositories found for your query.</p>"
     html = """
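For context, the parsing pattern above drops everything before the first "Final Rank:" marker and keeps at most ten entries; a minimal sketch with made-up input:

raw_result = "Header noise Final Rank: 1. repo-a details Final Rank: 2. repo-b details"
entries = raw_result.strip().split("Final Rank:")
# entries[0] is the text before the first marker, so the slice skips it
# and keeps at most the next ten entries.
entries = entries[1:11]
print(len(entries))  # -> 2 for this toy input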
@@ -163,8 +161,6 @@ with gr.Blocks(
         elem_id="header"
     )
 
-
-    # Centered main container for inputs and outputs.
    with gr.Column(elem_id="main-container"):
        research_input = gr.Textbox(
            label="Research Query",
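Together with the from main import run_repository_ranking line added in the first hunk, the column above is what feeds user queries into the ranker. A minimal sketch of that wiring (the button and output component names here are hypothetical; the Space's real layout has more elements and styling):

import gradio as gr
from main import run_repository_ranking

with gr.Blocks() as demo:
    with gr.Column(elem_id="main-container"):
        research_input = gr.Textbox(label="Research Query")
        results_html = gr.HTML()          # hypothetical output component
        search_btn = gr.Button("Search")  # hypothetical control
    # Route the query through the ranking function and render its output.
    search_btn.click(fn=run_repository_ranking, inputs=research_input, outputs=results_html)

demo.launch()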
main.py
CHANGED
@@ -6,7 +6,12 @@ import faiss
 import re
 import logging
 from pathlib import Path
+
+# For local development, load environment variables from a .env file.
+# In HuggingFace Spaces, secrets are automatically available as environment variables.
 from dotenv import load_dotenv
+load_dotenv()
+
 from sentence_transformers import SentenceTransformer, CrossEncoder
 from langchain_groq import ChatGroq
 from langchain_core.prompts import ChatPromptTemplate
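The load_dotenv() call is a no-op when no .env file exists, which is exactly the Spaces case; a minimal sketch of the local-development path (the variable name is one this commit defines):

import os
from dotenv import load_dotenv

# Reads KEY=value pairs from a .env file in the working directory, if one
# exists; on HuggingFace Spaces there is none, and secrets are already
# injected as real environment variables.
load_dotenv()
print("GITHUB_API_KEY set:", os.getenv("GITHUB_API_KEY") is not None)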
@@ -18,14 +23,21 @@ except ImportError:
     BM25Okapi = None
 
 # ---------------------------
-# Environment Setup
+# Environment Variables & Setup
 # ---------------------------
-
+# GitHub API key (required for GitHub API calls)
+GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")
+# GROQ API key (if required by ChatGroq)
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+# HuggingFace token (if you need it to load private models from HuggingFace)
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
 CROSS_ENCODER_MODEL = os.getenv("CROSS_ENCODER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
-
+
+# Set up a persistent session for GitHub API requests.
 session = requests.Session()
 session.headers.update({
-    "Authorization": f"token {
+    "Authorization": f"token {GITHUB_API_KEY}",
     "Accept": "application/vnd.github.v3+json"
 })
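Sending the token on every request matters mainly for rate limits. One way to verify the session is actually authenticated, using GitHub's real /rate_limit endpoint (this check is an illustration, not part of the commit):

import os
import requests

session = requests.Session()
session.headers.update({
    "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
    "Accept": "application/vnd.github.v3+json",
})

# Authenticated clients get 5,000 core requests/hour instead of 60.
core = session.get("https://api.github.com/rate_limit").json()["resources"]["core"]
print(core["limit"], core["remaining"])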
@@ -37,7 +49,9 @@ llm = ChatGroq(
     temperature=0.3,
     max_tokens=512,
     max_retries=3,
+    api_key=GROQ_API_KEY  # Pass GROQ_API_KEY if the ChatGroq library supports it.
 )
+
 prompt = ChatPromptTemplate.from_messages([
     ("system",
      """You are a GitHub search optimization expert.
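langchain_groq's ChatGroq also reads GROQ_API_KEY from the environment on its own, so the explicit keyword is belt-and-braces. Typical usage of the llm/prompt pair built here, as a sketch (the model id and the human message template are hypothetical; the hunk does not show them):

import os
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

llm = ChatGroq(
    model="llama3-8b-8192",  # hypothetical; the commit does not show the model id
    temperature=0.3,
    max_tokens=512,
    max_retries=3,
    api_key=os.getenv("GROQ_API_KEY"),
)
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a GitHub search optimization expert."),
    ("human", "{query}"),  # hypothetical placeholder name
])
# Compose prompt and model into a runnable chain and invoke it.
chain = prompt | llm
print(chain.invoke({"query": "graph-based RAG for code search"}).content)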
@@ -115,7 +129,7 @@ def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str
 # ---------------------------
 # GitHub API Helper Functions
 # ---------------------------
-def fetch_readme_content(repo_full_name):
+def fetch_readme_content(repo_full_name: str) -> str:
     readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
     response = session.get(readme_url)
     if response.status_code == 200:
@@ -126,7 +140,7 @@ def fetch_readme_content(repo_full_name):
         return ""
     return ""
 
-def fetch_markdown_contents(repo_full_name):
+def fetch_markdown_contents(repo_full_name: str) -> str:
     url = f"https://api.github.com/repos/{repo_full_name}/contents"
     response = session.get(url)
     contents = ""
@@ -141,12 +155,12 @@ def fetch_markdown_contents(repo_full_name):
             contents += "\n" + file_resp.text
     return contents
 
-def fetch_all_markdown(repo_full_name):
+def fetch_all_markdown(repo_full_name: str) -> str:
     readme = fetch_readme_content(repo_full_name)
     other_md = fetch_markdown_contents(repo_full_name)
     return readme + "\n" + other_md
 
-def fetch_github_repositories(query, max_results=10):
+def fetch_github_repositories(query: str, max_results: int = 10) -> list:
     url = "https://api.github.com/search/repositories"
     params = {
         "q": query,
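Assuming the helper returns a list of per-repository dicts shaped from the GitHub search response (only the start of its body appears in the hunk), calling it looks like:

from main import fetch_github_repositories

repos = fetch_github_repositories("retrieval augmented generation language:python", max_results=5)
for repo in repos:
    print(repo)  # exact keys depend on how the helper maps the API response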
@@ -173,12 +187,13 @@ def fetch_github_repositories(query, max_results=10):
 # Dense Retrieval Model Setup
 # ---------------------------
 try:
+    # If using a GPU-enabled model, the HuggingFace token can be used for private models.
     model = SentenceTransformer('all-mpnet-base-v2', device='cuda')
 except Exception as e:
     print("Error initializing GPU for SentenceTransformer; falling back to CPU:", e)
     model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
-
-def robust_min_max_norm(scores):
+
+def robust_min_max_norm(scores: np.ndarray) -> np.ndarray:
     min_val = scores.min()
     max_val = scores.max()
     if max_val - min_val < 1e-10:
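The 1e-10 guard exists because min-max normalization divides by (max - min), which blows up when all scores are nearly identical. A standalone sketch of the same pattern; the return value in the degenerate branch is an assumption, since the hunk cuts off before it:

import numpy as np

def robust_min_max_norm(scores: np.ndarray) -> np.ndarray:
    min_val = scores.min()
    max_val = scores.max()
    if max_val - min_val < 1e-10:
        # Assumed behavior: treat identical scores as equally relevant.
        return np.ones_like(scores)
    return (scores - min_val) / (max_val - min_val)

print(robust_min_max_norm(np.array([0.2, 0.5, 0.8])))  # [0.  0.5 1. ]
print(robust_min_max_norm(np.array([0.4, 0.4, 0.4])))  # [1. 1. 1.]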
@@ -188,17 +203,18 @@ def robust_min_max_norm(scores):
 # ---------------------------
 # Cross-Encoder Re-Ranking Function
 # ---------------------------
-def cross_encoder_rerank_candidates(candidates, query, model_name, top_n=10):
+def cross_encoder_rerank_candidates(candidates: list, query: str, model_name: str, top_n: int = 10) -> list:
     try:
         cross_encoder = CrossEncoder(model_name, device='cuda')
     except Exception as e:
         print("Error initializing CrossEncoder on GPU; falling back to CPU:", e)
-        cross_encoder = CrossEncoder(model_name, device='cpu')
+        cross_encoder = CrossEncoder(model_name, device='cpu')
+
     CHUNK_SIZE = 2000
     MAX_DOC_LENGTH = 5000
     MIN_DOC_LENGTH = 200
 
-    def split_text(text, chunk_size=CHUNK_SIZE):
+    def split_text(text: str, chunk_size: int = CHUNK_SIZE) -> list:
         return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
 
     for candidate in candidates:
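split_text is a plain fixed-width slicer, so a document at the 5,000-character MAX_DOC_LENGTH cap yields three chunks:

CHUNK_SIZE = 2000

def split_text(text: str, chunk_size: int = CHUNK_SIZE) -> list:
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

doc = "x" * 5000
print([len(c) for c in split_text(doc)])  # [2000, 2000, 1000]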
@@ -213,13 +229,13 @@ def cross_encoder_rerank_candidates(candidates, query, model_name, top_n=10):
             chunks = split_text(doc)
             pairs = [[query, chunk] for chunk in chunks]
             scores = cross_encoder.predict(pairs)
-            max_score = np.max(scores) if
-            avg_score = np.mean(scores) if
+            max_score = np.max(scores) if scores else 0.0
+            avg_score = np.mean(scores) if scores else 0.0
             candidate["cross_encoder_score"] = float(0.5 * max_score + 0.5 * avg_score)
         except Exception as e:
             logging.error(f"Error scoring candidate {candidate.get('link', 'unknown')}: {e}")
             candidate["cross_encoder_score"] = 0.0
-
+
     all_scores = [candidate["cross_encoder_score"] for candidate in candidates]
     if all_scores:
         min_score = min(all_scores)
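Blending the best chunk score with the mean keeps one highly relevant section from being drowned out while still rewarding documents that are relevant throughout. A worked example with hypothetical per-chunk scores:

import numpy as np

scores = np.array([0.9, 0.1, 0.2])  # hypothetical cross-encoder chunk scores
blended = float(0.5 * np.max(scores) + 0.5 * np.mean(scores))
print(blended)  # 0.5 * 0.9 + 0.5 * 0.4 = 0.65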
@@ -227,7 +243,6 @@
     for candidate in candidates:
         candidate["cross_encoder_score"] += -min_score
 
-    # Do not sort solely by cross-encoder score; we want to combine metrics.
     return candidates
 
 # ---------------------------
@@ -318,11 +333,9 @@ def run_repository_ranking(query: str) -> str:
 
     # Step 9: Compute cross-encoder scores for the top candidates.
     top_candidates = ranked_repositories[:100] if len(ranked_repositories) > 100 else ranked_repositories
-    # Update candidates with cross-encoder scores.
     cross_encoder_rerank_candidates(top_candidates, query, model_name=CROSS_ENCODER_MODEL, top_n=len(top_candidates))
 
-    #
-    # Adjust weights as needed (here 0.7 for combined, 0.3 for cross-encoder).
+    # Combine both metrics: final_score = w1 * combined_score + w2 * cross_encoder_score.
     w1 = 0.7
     w2 = 0.3
     for candidate in top_candidates: