Spaces: Running on Zero
zamalali committed
Commit 777083e · 1 Parent(s): 1796763
Refactor DeepGit Lite to load environment variables, update API integration, and enhance user feedback
Files changed:
- __pycache__/main.cpython-311.pyc +0 -0
- app.py +10 -11
- main.py +280 -0
- src/__init__.py +43 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/deepgit_lite.cpython-311.pyc +0 -0
- src/deepgit_lite.py +103 -34
__pycache__/main.cpython-311.pyc ADDED
Binary file (15.7 kB)
app.py CHANGED

@@ -2,7 +2,7 @@ import gradio as gr
 import time
 import threading
 import logging
-from …
+from main import run_repository_ranking  # Import the new function from main.py
 
 # ---------------------------
 # Global Logging Buffer Setup
@@ -59,7 +59,7 @@ title = """
 """
 
 description = """<p align="center">
-DeepGit Lite is a streamlined …
+DeepGit Lite is a streamlined tool for semantic search on GitHub repositories. It retrieves repositories using dense retrieval, ranks them by similarity, and then presents the top results.
 </p>"""
 
 consent_text = """
@@ -81,7 +81,7 @@ footer = """
 """
 
 # ---------------------------
-# HTML Table Renderer for …
+# HTML Table Renderer for Results
 # ---------------------------
 def format_percent(value):
     try:
@@ -116,8 +116,7 @@ def parse_result_to_html(raw_result: str) -> str:
                 <th>Rank</th>
                 <th>Title</th>
                 <th>Link</th>
-                <th>…
-                <th>Final Score</th>
+                <th>Combined Score</th>
             </tr>
         </thead>
         <tbody>
@@ -135,18 +134,17 @@ def parse_result_to_html(raw_result: str) -> str:
             <td>{data.get('Final Rank', '')}</td>
             <td>{data.get('Title', '')}</td>
             <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
-            <td>{…
-            <td>{format_percent(data.get('Final Score', ''))}</td>
+            <td>{data.get('Combined Score', '')}</td>
         </tr>
         """
     html += "</tbody></table>"
     return html
 
 # ---------------------------
-# Background Workflow Runner
+# Background Workflow Runner
 # ---------------------------
 def run_lite_workflow(topic, result_container):
-    result = …
+    result = run_repository_ranking(topic)
     result_container["raw_result"] = result
 
 def stream_lite_workflow(topic):
@@ -180,7 +178,6 @@ def stream_lite_workflow(topic):
 # App UI Setup for DeepGit Lite
 # ---------------------------
 with gr.Blocks(
-    theme="gstaff/sketch",
     css="""
     #main_container { margin: auto; max-width: 900px; }
     footer, footer * { display: none !important; }
@@ -198,7 +195,7 @@ with gr.Blocks(
     with gr.Column(elem_id="main_container", visible=False) as main_block:
         research_input = gr.Textbox(
             label="Research Topic",
-            placeholder="Enter your research topic here, e.g., '…
+            placeholder="Enter your research topic here, e.g., 'Fine tuning Instruction tuned LLama models...'",
             lines=3
         )
         run_button = gr.Button("Run DeepGit Lite", variant="primary")
@@ -212,7 +209,9 @@
 
     agree_button.click(fn=enable_main, inputs=[], outputs=[consent_block, main_block], queue=False)
 
+    # Added initial yield for immediate feedback when the button is pressed.
    def lite_runner(topic):
+        yield "Workflow started", "<p>Processing your request. Please wait...</p>"
        for status, details in stream_lite_workflow(topic):
            yield status, details
 
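The last hunk adds an immediate "Workflow started" yield so the user sees feedback as soon as the button is pressed. The button wiring itself falls outside the visible hunks; the following is a minimal sketch, not part of this commit, of how such a streaming generator is typically attached to a Gradio button. The status_output and detail_output component names are assumptions introduced only for illustration.

# Illustrative sketch (not from the commit): streaming a generator's yields to two outputs.
import gradio as gr

def lite_runner(topic):
    # First yield gives immediate feedback; later yields stream real progress.
    yield "Workflow started", "<p>Processing your request. Please wait...</p>"
    yield "Done", f"<p>Finished processing: {topic}</p>"

with gr.Blocks() as demo:
    research_input = gr.Textbox(label="Research Topic", lines=3)
    run_button = gr.Button("Run DeepGit Lite", variant="primary")
    status_output = gr.Textbox(label="Status")      # assumed component name
    detail_output = gr.HTML()                        # assumed component name
    # Because lite_runner is a generator, Gradio streams each yielded tuple to the outputs.
    run_button.click(fn=lite_runner, inputs=research_input, outputs=[status_output, detail_output])

if __name__ == "__main__":
    demo.launch()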
main.py
ADDED
@@ -0,0 +1,280 @@
import os
import base64
import requests
import numpy as np
import faiss
import re
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from pathlib import Path
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# Optionally import BM25 for sparse retrieval.
try:
    from rank_bm25 import BM25Okapi
except ImportError:
    BM25Okapi = None

# ---------------------------
# Environment Setup
# ---------------------------
load_dotenv()

# Setup a persistent session for GitHub API requests
session = requests.Session()
session.headers.update({
    "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
    "Accept": "application/vnd.github.v3+json"
})

# ---------------------------
# Langchain Groq Setup
# ---------------------------
llm = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    temperature=0.3,
    max_tokens=512,
    max_retries=3,
)
prompt = ChatPromptTemplate.from_messages([
    ("system",
     """You are a GitHub search optimization expert.

Your job is to:
1. Read a user's query about tools, research, or tasks.
2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
   Use as many tags as necessary based on the query's complexity, but never more than five.
5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
   If no specific language is mentioned, do not include any target tag.

Output Format:
tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]

Rules:
- Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
- Use terms commonly found in GitHub repo names, topics, or descriptions.
- Avoid generic terms like "python", "ai", "tool", "project".
- Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
- Prefer real tools, popular methods, or dataset names when mentioned.
- If your output does not strictly match the required format, correct it after your internal reasoning.
- Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.

Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
"""),
    ("human", "{query}")
])
chain = prompt | llm

def parse_search_tags(response) -> str:
    """
    Removes internal chain-of-thought (enclosed in <think> tags) and returns only the final search tags.
    """
    response_str = str(response)
    if "<think>" in response_str and "</think>" in response_str:
        end_index = response_str.index("</think>") + len("</think>")
        tags = response_str[end_index:].strip()
        return tags
    else:
        return response_str.strip()

def valid_tags(tags: str) -> bool:
    """
    Validates that the output is one to six colon-separated tokens composed of lowercase letters, numbers, and hyphens.
    """
    pattern = r'^[a-z0-9-]+(?::[a-z0-9-]+){0,5}$'
    return re.match(pattern, tags) is not None

def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
    refined_query = query
    tags_output = ""
    for iteration in range(max_iterations):
        print(f"\n🔄 Iteration {iteration+1}")
        response = chain.invoke({"query": refined_query})
        full_output = response.content.strip()
        tags_output = parse_search_tags(full_output)
        print(f"Output Tags: {tags_output}")
        if valid_tags(tags_output):
            print("✅ Valid tags format detected.")
            return tags_output
        else:
            print("⚠️ Invalid tags format. Requesting refinement...")
            refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
    print("Final output (may be invalid):", tags_output)
    return tags_output
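As a quick aside (not part of the committed file): the query-conversion helpers above enforce a strict colon-separated tag contract. A tiny illustration of how parse_search_tags and valid_tags behave on a hypothetical model response:

# Illustrative only (not in the commit): expected behaviour of the tag helpers.
sample_response = "<think>The user wants LLM fine-tuning data pipelines.</think>data-augmentation:llm-fine-tuning:peft"
tags = parse_search_tags(sample_response)
print(tags)              # -> "data-augmentation:llm-fine-tuning:peft"
print(valid_tags(tags))  # -> True
print(valid_tags("Data Augmentation, PEFT"))  # -> False (uppercase, spaces, and commas are rejected)

The committed file continues below with the GitHub API helpers and the ranking pipeline.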
# ---------------------------
# GitHub API Helper Functions
# ---------------------------
def fetch_readme_content(repo_full_name):
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    response = session.get(readme_url)
    if response.status_code == 200:
        readme_data = response.json()
        try:
            return base64.b64decode(readme_data.get('content', '')).decode('utf-8', errors='replace')
        except Exception:
            return ""
    return ""

def fetch_github_repositories(query, max_results=10):
    """
    Searches GitHub repositories using the provided query and retrieves key information.
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "per_page": max_results
    }
    response = session.get(url, params=params)
    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.json().get('message')}")
        return []
    repo_list = []
    for repo in response.json().get('items', []):
        repo_link = repo.get('html_url')
        description = repo.get('description') or ""
        readme_content = fetch_readme_content(repo.get('full_name'))
        # Combine description and README for a richer document context.
        combined_text = (description + "\n" + readme_content).strip()
        repo_list.append({
            "title": repo.get('name', 'No title available'),
            "link": repo_link,
            "combined_text": combined_text
        })
    return repo_list

# ---------------------------
# Initialize SentenceTransformer Model
# ---------------------------
model = SentenceTransformer('all-MiniLM-L6-v2')

def robust_min_max_norm(scores):
    """
    Performs min-max normalization while avoiding division by zero.
    """
    min_val = scores.min()
    max_val = scores.max()
    if max_val - min_val < 1e-10:
        return np.ones_like(scores)
    return (scores - min_val) / (max_val - min_val)

# ---------------------------
# Main Function: Repository Ranking with Hybrid Retrieval
# ---------------------------
def run_repository_ranking(query: str) -> str:
    """
    Converts the user query into search tags, runs multiple GitHub queries (individual and combined),
    deduplicates results, and applies hybrid dense (FAISS) and sparse (BM25) ranking.
    """
    # Step 1: Generate search tags from the query.
    search_tags = iterative_convert_to_search_tags(query)
    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]

    # Step 2: Handle target language extraction.
    target_lang = None
    if any(tag.startswith("target-") for tag in tag_list):
        target_tag = next(tag for tag in tag_list if tag.startswith("target-"))
        target_lang = target_tag.replace("target-", "")
        lang_query = f"language:{target_lang}"
        tag_list = [tag for tag in tag_list if not tag.startswith("target-")]
    else:
        lang_query = "language:python"

    # Step 3: Build advanced search qualifiers.
    advanced_qualifier = "in:name,description,readme"
    all_repositories = []

    # Loop over individual tags.
    for tag in tag_list:
        github_query = f"{tag} {advanced_qualifier} {lang_query}"
        print("GitHub Query:", github_query)
        repos = fetch_github_repositories(github_query, max_results=15)
        all_repositories.extend(repos)

    # Also perform a combined query using OR logic for higher recall.
    combined_query = " OR ".join(tag_list)
    combined_query = f"({combined_query}) {advanced_qualifier} {lang_query}"
    print("Combined GitHub Query:", combined_query)
    repos = fetch_github_repositories(combined_query, max_results=15)
    all_repositories.extend(repos)

    # Deduplicate repositories using the repo link.
    unique_repositories = {}
    for repo in all_repositories:
        if repo["link"] not in unique_repositories:
            unique_repositories[repo["link"]] = repo
        else:
            # Merge content if the repository appears in multiple queries.
            existing_text = unique_repositories[repo["link"]]["combined_text"]
            unique_repositories[repo["link"]]["combined_text"] = existing_text + "\n" + repo["combined_text"]
    repositories = list(unique_repositories.values())

    if not repositories:
        return "No repositories found for your query."

    # Step 4: Prepare documents by using the combined text (description + README).
    docs = [repo.get("combined_text", "") for repo in repositories]

    # Step 5: Compute dense embeddings and build the FAISS index.
    doc_embeddings = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    if doc_embeddings.ndim == 1:
        doc_embeddings = doc_embeddings.reshape(1, -1)
    norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    norm_doc_embeddings = doc_embeddings / (norms + 1e-10)

    query_embedding = model.encode(query, convert_to_numpy=True)
    if query_embedding.ndim == 1:
        query_embedding = query_embedding.reshape(1, -1)
    norm_query_embedding = query_embedding / (np.linalg.norm(query_embedding) + 1e-10)

    dim = norm_doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(norm_doc_embeddings)
    k = norm_doc_embeddings.shape[0]
    D, I = index.search(norm_query_embedding, k)
    dense_scores = D.squeeze()
    norm_dense_scores = robust_min_max_norm(dense_scores)

    # Step 6: Compute BM25 scores with improved tokenization.
    if BM25Okapi is not None:
        tokenized_docs = [re.findall(r'\w+', doc.lower()) for doc in docs]
        bm25 = BM25Okapi(tokenized_docs)
        query_tokens = re.findall(r'\w+', query.lower())
        bm25_scores = np.array(bm25.get_scores(query_tokens))
        norm_bm25_scores = robust_min_max_norm(bm25_scores)
    else:
        norm_bm25_scores = np.zeros_like(norm_dense_scores)

    # Step 7: Combine scores (with dense retrieval given higher weight).
    alpha = 0.8  # Weight for dense retrieval
    combined_scores = alpha * norm_dense_scores + (1 - alpha) * norm_bm25_scores

    for idx, repo in enumerate(repositories):
        repo["combined_score"] = float(combined_scores[idx])

    # Step 8: Rank repositories and format output.
    ranked_repositories = sorted(repositories, key=lambda x: x.get("combined_score", 0), reverse=True)

    output = "\n=== Ranked Repositories ===\n"
    for rank, repo in enumerate(ranked_repositories, 1):
        output += f"Final Rank: {rank}\n"
        output += f"Title: {repo['title']}\n"
        output += f"Link: {repo['link']}\n"
        output += f"Combined Score: {repo.get('combined_score', 0):.4f}\n"
        snippet = repo['combined_text'][:300].replace('\n', ' ')
        output += f"Snippet: {snippet}...\n"
        output += '-' * 80 + "\n"
    output += "\n=== End of Results ==="
    return output

# ---------------------------
# Main Entry Point for Testing
# ---------------------------
if __name__ == "__main__":
    test_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
    result = run_repository_ranking(test_query)
    print(result)
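The report returned by run_repository_ranking scores each repository with the Step 7 weighting: 0.8 × normalized dense (FAISS) similarity + 0.2 × normalized BM25 score. A usage sketch (not part of the commit) is shown below; it assumes a .env file providing GITHUB_API_KEY for the GitHub session and the Groq credentials that ChatGroq expects (typically GROQ_API_KEY).

# Usage sketch only (not in the commit); requires GITHUB_API_KEY and Groq credentials in .env.
from main import run_repository_ranking

if __name__ == "__main__":
    report = run_repository_ranking("image augmentation libraries for object detection in JavaScript")
    print(report)  # plain-text listing: Final Rank, Title, Link, Combined Score, Snippet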
src/__init__.py
CHANGED
@@ -0,0 +1,43 @@
from pathlib import Path
from dotenv import load_dotenv
import os
import requests

# Load .env from the project root
dotenv_path = Path(__file__).resolve().parents[1] / ".env"
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path)

# Get GitHub API key from environment
GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")
if not GITHUB_API_KEY:
    raise ValueError("GITHUB_API_KEY not found in environment variables.")

# Rest of your code remains the same...


# Set up the GitHub Search API URL and headers.
url = "https://api.github.com/search/repositories"
headers = {
    "Authorization": f"token {GITHUB_API_KEY}",
    "Accept": "application/vnd.github.v3+json"
}

# Define a simple query.
query = "data augmentation language:python"
params = {
    "q": query,
    "sort": "stars",
    "order": "desc",
    "per_page": 10  # adjust the number of results as needed
}

response = requests.get(url, headers=headers, params=params)
if response.status_code == 200:
    data = response.json()
    items = data.get("items", [])
    print(f"Found {len(items)} repositories:")
    for repo in items:
        print(f"- {repo['full_name']}: {repo['html_url']}")
else:
    print(f"Error {response.status_code}: {response.json().get('message')}")
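Note that this __init__.py issues a GitHub search request at import time. Purely as an illustration of an alternative design (not part of the commit), the same stars-sorted search can be wrapped in a function so importing the package performs no network call; the search_python_repos name below is hypothetical.

# Illustrative sketch only: the same query, without the import-time side effect.
import os
import requests

def search_python_repos(query: str = "data augmentation language:python", per_page: int = 10):
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json",
    }
    params = {"q": query, "sort": "stars", "order": "desc", "per_page": per_page}
    response = requests.get("https://api.github.com/search/repositories", headers=headers, params=params)
    response.raise_for_status()
    return response.json().get("items", [])

if __name__ == "__main__":
    for repo in search_python_repos():
        print(f"- {repo['full_name']}: {repo['html_url']}")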
src/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (215 Bytes)

src/__pycache__/deepgit_lite.cpython-311.pyc DELETED
Binary file (16.7 kB)
src/deepgit_lite.py
CHANGED
@@ -11,16 +11,22 @@ from dotenv import load_dotenv
 from pathlib import Path
 from langchain_groq import ChatGroq
 from langchain_core.prompts import ChatPromptTemplate
+import re
+import getpass
 
 # ---------------------------
 # Environment and .env Setup
 # ---------------------------
-dotenv_path = Path(__file__).resolve().…
-…
+dotenv_path = Path(__file__).resolve().parents[1] / ".env"
+if dotenv_path.exists():
+    load_dotenv(dotenv_path=dotenv_path)
 
 if "GITHUB_API_KEY" not in os.environ:
     raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
 
+# Optionally, silence bitsandbytes warnings if desired.
+os.environ["BITSANDBYTES_NOWARN"] = "1"
+
 # ---------------------------
 # Logging Setup
 # ---------------------------
@@ -28,31 +34,86 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
 logger = logging.getLogger(__name__)
 
 # ---------------------------
-# ChatGroq Integration Setup (for query …
+# ChatGroq Integration Setup (for query conversion and final justification)
 # ---------------------------
 llm_groq = ChatGroq(
-    model="…
+    model="deepseek-r1-distill-llama-70b",
     temperature=0.2,
-    max_tokens=…
+    max_tokens=800,
     timeout=15,
     max_retries=2
 )
 
-…
-…
-…
-…
-    Provide the refined query text."""
-    messages = [
-        ("system", "You are a helpful research assistant specializing in AI and software research."),
-        ("human", prompt)
-    ]
-    result = llm_groq.invoke(messages)
-    # Extract text content if available
-    if hasattr(result, "content"):
-        return result.content
-    return str(result)
+# --- Query Conversion Functions ---
+prompt = ChatPromptTemplate.from_messages([
+    ("system",
+     """You are a GitHub search optimization expert.
+
+Your job is to:
+1. Read a user's query about tools, research, or tasks.
+2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
+3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
+4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
+   Use as many tags as necessary based on the query's complexity, but never more than five.
+5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
+   If no specific language is mentioned, do not include any target tag.
+
+Output Format:
+tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
+
+Rules:
+- Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
+- Use terms commonly found in GitHub repo names, topics, or descriptions.
+- Avoid generic terms like "python", "ai", "tool", "project".
+- Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
+- Prefer real tools, popular methods, or dataset names when mentioned.
+- If your output does not strictly match the required format, correct it after your internal reasoning.
+- Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
+
+Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
+"""),
+    ("human", "{query}")
+])
+chain = prompt | llm_groq
+
+def parse_search_tags(response: str) -> str:
+    """
+    Removes any internal commentary enclosed in <think> ... </think> tags using regex,
+    and returns only the final searchable tags.
+    """
+    cleaned = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
+    return cleaned
+
+def valid_tags(tags: str) -> bool:
+    """
+    Validates that the output is one to six colon-separated tokens composed of lowercase letters, numbers, and hyphens.
+    """
+    pattern = r'^[a-z0-9-]+(?::[a-z0-9-]+){0,5}$'
+    return re.match(pattern, tags) is not None
+
+def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
+    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
+    refined_query = query
+    tags_output = ""
+    for iteration in range(max_iterations):
+        print(f"\n🔄 Iteration {iteration+1}")
+        response = chain.invoke({"query": refined_query})
+        full_output = response.content.strip()
+        tags_output = parse_search_tags(full_output)
+        print(f"Output Tags: {tags_output}")
+        if valid_tags(tags_output):
+            print("✅ Valid tags format detected.")
+            return tags_output
+        else:
+            print("⚠️ Invalid tags format. Requesting refinement...")
+            refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
+    print("Final output (may be invalid):", tags_output)
+    # Fallback default tags if output is still invalid
+    fallback = "data-augmentation:llm-fine-tuning"
+    print(f"Using fallback search tags: {fallback}")
+    return fallback
+
+# --- Justification Function ---
 def justify_candidate(candidate, query):
     prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
 
@@ -104,11 +165,9 @@ def fetch_directory_markdown(repo_full_name, path, headers):
 
 def fetch_repo_documentation(repo_full_name, headers):
     doc_text = ""
-    # Fetch README first.
     readme = fetch_readme_content(repo_full_name, headers)
     if readme:
         doc_text += "# README\n" + readme
-    # Fetch additional markdown files and documentation directories.
     root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
     response = requests.get(root_url, headers=headers)
     if response.status_code == 200:
@@ -165,27 +224,33 @@ def fetch_github_repositories(query, max_results=1000, per_page=100):
 # Main Lite Workflow Function
 # ---------------------------
 def run_deepgit_lite(user_query):
-    # Stage 0: Query …
-    logger.info("…
+    # Stage 0: Query Conversion using iterative_convert_to_search_tags
+    logger.info("Converting query to searchable tags...")
     original_query = user_query.strip()
-    …
-    logger.info(f"…
-    …
+    search_tags = iterative_convert_to_search_tags(original_query)
+    logger.info(f"Search Tags: {search_tags}")
+    # Convert colon-separated tags into a space-separated query string.
+    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
+    github_query = " ".join(tag_list) + " language:python"
     logger.info(f"Using GitHub query: {github_query}")
 
-    # Stage 1: Dense Retrieval with FAISS
+    # Stage 1: Dense Retrieval with FAISS - Fetch repositories using the query.
     logger.info("Fetching repositories from GitHub...")
     repos = fetch_github_repositories(github_query)
     if not repos:
-        logger.…
-        …
-        …
+        logger.warning("No repositories found with converted query. Falling back to default query.")
+        fallback_query = "data augmentation language:python"
+        logger.info(f"Using fallback GitHub query: {fallback_query}")
+        repos = fetch_github_repositories(fallback_query)
+        if not repos:
+            logger.error("No repositories found with fallback query either.")
+            return "\nNo repositories found for your query. Please try a different query."
+
     docs = [repo.get("combined_doc", "") for repo in repos]
     logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
-    sem_model = SentenceTransformer("all-mpnet-base-v2")
+    sem_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
     doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
 
-    # Check if embeddings array is empty or 1-dimensional
     if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
         logger.error("No document embeddings generated. Aborting dense retrieval.")
         return "\nFailed to generate document embeddings. Please try again."
@@ -210,7 +275,7 @@ def run_deepgit_lite(user_query):
     # Stage 2: Filtering Low-Star Repositories
     filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
     if not filtered_candidates:
-        filtered_candidates = ranked_by_semantic
+        filtered_candidates = ranked_by_semantic
     logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")
 
     # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
@@ -228,7 +293,6 @@ def run_deepgit_lite(user_query):
     for repo in filtered_candidates:
         norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
         norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
-        # Weights: 60% semantic, 40% stars.
         repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
 
     final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
@@ -256,3 +320,8 @@ def run_deepgit_lite(user_query):
     result_text += "\n=== End of Results ==="
 
     return result_text
+
+# For debugging: if run directly, execute with an example query.
+if __name__ == "__main__":
+    test_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
+    print(run_deepgit_lite(test_query))
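For context on the Stage 3 weighting that these hunks leave unchanged: each candidate's final score is 0.6 × its normalized semantic similarity plus 0.4 × its normalized log-star count. A small worked example follows (illustrative only; the min-max normalize helper lives in an unchanged part of the file, so the version below is an assumption about its behaviour).

# Illustrative sketch: how the 0.6/0.4 weighting trades off similarity against popularity.
import math

def normalize(value, min_val, max_val):
    # Assumed to mirror the file's min-max helper; collapses to 0.5 when the range is degenerate.
    if max_val - min_val < 1e-10:
        return 0.5
    return (value - min_val) / (max_val - min_val)

# Two toy candidates: one more similar to the query, one more popular.
candidates = [
    {"title": "repo-a", "semantic_similarity": 0.82, "stars": 120},
    {"title": "repo-b", "semantic_similarity": 0.64, "stars": 9500},
]
sems = [c["semantic_similarity"] for c in candidates]
log_stars = [math.log(c["stars"] + 1) for c in candidates]

for c, ls in zip(candidates, log_stars):
    norm_sem = normalize(c["semantic_similarity"], min(sems), max(sems))
    norm_star = normalize(ls, min(log_stars), max(log_stars))
    c["final_score"] = 0.6 * norm_sem + 0.4 * norm_star

print(sorted(candidates, key=lambda c: c["final_score"], reverse=True))
# repo-a: 0.6*1.0 + 0.4*0.0 = 0.60 ; repo-b: 0.6*0.0 + 0.4*1.0 = 0.40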