zamalali committed on
Commit
777083e
·
1 Parent(s): 1796763

Refactor DeepGit Lite to load environment variables, update API integration, and enhance user feedback

Browse files
__pycache__/main.cpython-311.pyc ADDED
Binary file (15.7 kB). View file
 
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import time
3
  import threading
4
  import logging
5
- from src.deepgit_lite import run_deepgit_lite
6
 
7
  # ---------------------------
8
  # Global Logging Buffer Setup
@@ -59,7 +59,7 @@ title = """
59
  """
60
 
61
  description = """<p align="center">
62
- DeepGit Lite is a streamlined version of DeepGit designed for fast semantic search on GitHub repositories. It enhances your query, retrieves repositories using dense retrieval via FAISS, filters by star count, combines scores based on semantic similarity and popularity, and then provides a concise justification for the top results.
63
  </p>"""
64
 
65
  consent_text = """
@@ -81,7 +81,7 @@ footer = """
81
  """
82
 
83
  # ---------------------------
84
- # HTML Table Renderer for DeepGit Lite
85
  # ---------------------------
86
  def format_percent(value):
87
  try:
@@ -116,8 +116,7 @@ def parse_result_to_html(raw_result: str) -> str:
116
  <th>Rank</th>
117
  <th>Title</th>
118
  <th>Link</th>
119
- <th>Semantic Similarity</th>
120
- <th>Final Score</th>
121
  </tr>
122
  </thead>
123
  <tbody>
@@ -135,18 +134,17 @@ def parse_result_to_html(raw_result: str) -> str:
135
  <td>{data.get('Final Rank', '')}</td>
136
  <td>{data.get('Title', '')}</td>
137
  <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
138
- <td>{format_percent(data.get('Semantic Similarity', ''))}</td>
139
- <td>{format_percent(data.get('Final Score', ''))}</td>
140
  </tr>
141
  """
142
  html += "</tbody></table>"
143
  return html
144
 
145
  # ---------------------------
146
- # Background Workflow Runner for DeepGit Lite
147
  # ---------------------------
148
  def run_lite_workflow(topic, result_container):
149
- result = run_deepgit_lite(topic)
150
  result_container["raw_result"] = result
151
 
152
  def stream_lite_workflow(topic):
@@ -180,7 +178,6 @@ def stream_lite_workflow(topic):
180
  # App UI Setup for DeepGit Lite
181
  # ---------------------------
182
  with gr.Blocks(
183
- theme="gstaff/sketch",
184
  css="""
185
  #main_container { margin: auto; max-width: 900px; }
186
  footer, footer * { display: none !important; }
@@ -198,7 +195,7 @@ with gr.Blocks(
198
  with gr.Column(elem_id="main_container", visible=False) as main_block:
199
  research_input = gr.Textbox(
200
  label="Research Topic",
201
- placeholder="Enter your research topic here, e.g., 'Instruction-based fine-tuning for LLaMA 2 using chain-of-thought prompting in Python.'",
202
  lines=3
203
  )
204
  run_button = gr.Button("Run DeepGit Lite", variant="primary")
@@ -212,7 +209,9 @@ with gr.Blocks(
212
 
213
  agree_button.click(fn=enable_main, inputs=[], outputs=[consent_block, main_block], queue=False)
214
 
 
215
  def lite_runner(topic):
 
216
  for status, details in stream_lite_workflow(topic):
217
  yield status, details
218
 
 
2
  import time
3
  import threading
4
  import logging
5
+ from main import run_repository_ranking # Import the new function from main.py
6
 
7
  # ---------------------------
8
  # Global Logging Buffer Setup
 
59
  """
60
 
61
  description = """<p align="center">
62
+ DeepGit Lite is a streamlined tool for semantic search on GitHub repositories. It retrieves repositories using dense retrieval, ranks them by similarity, and then presents the top results.
63
  </p>"""
64
 
65
  consent_text = """
 
81
  """
82
 
83
  # ---------------------------
84
+ # HTML Table Renderer for Results
85
  # ---------------------------
86
  def format_percent(value):
87
  try:
 
116
  <th>Rank</th>
117
  <th>Title</th>
118
  <th>Link</th>
119
+ <th>Combined Score</th>
 
120
  </tr>
121
  </thead>
122
  <tbody>
 
134
  <td>{data.get('Final Rank', '')}</td>
135
  <td>{data.get('Title', '')}</td>
136
  <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
137
+ <td>{data.get('Combined Score', '')}</td>
 
138
  </tr>
139
  """
140
  html += "</tbody></table>"
141
  return html
142
 
143
  # ---------------------------
144
+ # Background Workflow Runner
145
  # ---------------------------
146
  def run_lite_workflow(topic, result_container):
147
+ result = run_repository_ranking(topic)
148
  result_container["raw_result"] = result
149
 
150
  def stream_lite_workflow(topic):
 
178
  # App UI Setup for DeepGit Lite
179
  # ---------------------------
180
  with gr.Blocks(
 
181
  css="""
182
  #main_container { margin: auto; max-width: 900px; }
183
  footer, footer * { display: none !important; }
 
195
  with gr.Column(elem_id="main_container", visible=False) as main_block:
196
  research_input = gr.Textbox(
197
  label="Research Topic",
198
+ placeholder="Enter your research topic here, e.g., 'Fine tuning Instruction tuned LLama models...'",
199
  lines=3
200
  )
201
  run_button = gr.Button("Run DeepGit Lite", variant="primary")
 
209
 
210
  agree_button.click(fn=enable_main, inputs=[], outputs=[consent_block, main_block], queue=False)
211
 
212
+ # Added initial yield for immediate feedback when the button is pressed.
213
  def lite_runner(topic):
214
+ yield "Workflow started", "<p>Processing your request. Please wait...</p>"
215
  for status, details in stream_lite_workflow(topic):
216
  yield status, details
217
 
main.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import requests
4
+ import numpy as np
5
+ import faiss
6
+ import re
7
+ from sentence_transformers import SentenceTransformer
8
+ from dotenv import load_dotenv
9
+ from pathlib import Path
10
+ from langchain_groq import ChatGroq
11
+ from langchain_core.prompts import ChatPromptTemplate
12
+
13
+ # Optionally import BM25 for sparse retrieval.
14
+ try:
15
+ from rank_bm25 import BM25Okapi
16
+ except ImportError:
17
+ BM25Okapi = None
18
+
19
+ # ---------------------------
20
+ # Environment Setup
21
+ # ---------------------------
22
+ load_dotenv()
23
+
24
+ # Setup a persistent session for GitHub API requests
25
+ session = requests.Session()
26
+ session.headers.update({
27
+ "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
28
+ "Accept": "application/vnd.github.v3+json"
29
+ })
30
+
31
+ # ---------------------------
32
+ # Langchain Groq Setup
33
+ # ---------------------------
34
+ llm = ChatGroq(
35
+ model="deepseek-r1-distill-llama-70b",
36
+ temperature=0.3,
37
+ max_tokens=512,
38
+ max_retries=3,
39
+ )
40
+ prompt = ChatPromptTemplate.from_messages([
41
+ ("system",
42
+ """You are a GitHub search optimization expert.
43
+
44
+ Your job is to:
45
+ 1. Read a user's query about tools, research, or tasks.
46
+ 2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
47
+ 3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
48
+ 4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
49
+ Use as many tags as necessary based on the query's complexity, but never more than five.
50
+ 5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
51
+ If no specific language is mentioned, do not include any target tag.
52
+
53
+ Output Format:
54
+ tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
55
+
56
+ Rules:
57
+ - Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
58
+ - Use terms commonly found in GitHub repo names, topics, or descriptions.
59
+ - Avoid generic terms like "python", "ai", "tool", "project".
60
+ - Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
61
+ - Prefer real tools, popular methods, or dataset names when mentioned.
62
+ - If your output does not strictly match the required format, correct it after your internal reasoning.
63
+ - Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
64
+
65
+ Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
66
+ """),
67
+ ("human", "{query}")
68
+ ])
69
+ chain = prompt | llm
70
+
71
+ def parse_search_tags(response) -> str:
72
+ """
73
+ Removes internal chain-of-thought (enclosed in <think> tags) and returns only the final search tags.
74
+ """
75
+ response_str = str(response)
76
+ if "<think>" in response_str and "</think>" in response_str:
77
+ end_index = response_str.index("</think>") + len("</think>")
78
+ tags = response_str[end_index:].strip()
79
+ return tags
80
+ else:
81
+ return response_str.strip()
82
+
83
def valid_tags(tags: str) -> bool:
    """
    Validate that *tags* is one to six colon-separated tokens made of
    lowercase letters, digits, and hyphens.
    """
    token = r'[a-z0-9-]+'
    return bool(re.match(rf'^{token}(?::{token}){{0,5}}$', tags))
89
+
90
def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
    """
    Ask the LLM chain to turn *query* into colon-separated GitHub search
    tags, retrying (up to *max_iterations* times) with a corrective prompt
    whenever the output does not match the required tag format.

    Returns the last tags string produced, even if it is still invalid.
    """
    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
    current_query = query
    tags_output = ""
    for attempt in range(1, max_iterations + 1):
        print(f"\n🔄 Iteration {attempt}")
        reply = chain.invoke({"query": current_query})
        tags_output = parse_search_tags(reply.content.strip())
        print(f"Output Tags: {tags_output}")
        if valid_tags(tags_output):
            print("✅ Valid tags format detected.")
            return tags_output
        print("⚠️ Invalid tags format. Requesting refinement...")
        # Re-ask with an explicit format reminder appended to the query.
        current_query = (
            f"{query}\nPlease refine your answer so that the output strictly "
            "matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
        )
    print("Final output (may be invalid):", tags_output)
    return tags_output
108
+
109
+ # ---------------------------
110
+ # GitHub API Helper Functions
111
+ # ---------------------------
112
def fetch_readme_content(repo_full_name):
    """
    Fetch and decode the README of *repo_full_name* via the GitHub API.

    Returns the README text, or "" on any HTTP or decoding failure.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/readme"
    resp = session.get(url)
    if resp.status_code != 200:
        return ""
    payload = resp.json()
    try:
        # GitHub serves README content base64-encoded in the JSON body.
        raw = base64.b64decode(payload.get('content', ''))
        return raw.decode('utf-8', errors='replace')
    except Exception:
        return ""
122
+
123
def fetch_github_repositories(query, max_results=10):
    """
    Search GitHub repositories matching *query* and collect key fields.

    Each result dict carries the repo title, HTML link, and a combined
    description+README text. Returns [] on API errors.
    """
    resp = session.get(
        "https://api.github.com/search/repositories",
        params={"q": query, "per_page": max_results},
    )
    if resp.status_code != 200:
        print(f"Error {resp.status_code}: {resp.json().get('message')}")
        return []
    results = []
    for item in resp.json().get('items', []):
        description = item.get('description') or ""
        readme_text = fetch_readme_content(item.get('full_name'))
        # Merge description and README so ranking sees richer context.
        results.append({
            "title": item.get('name', 'No title available'),
            "link": item.get('html_url'),
            "combined_text": (description + "\n" + readme_text).strip(),
        })
    return results
149
+
150
+ # ---------------------------
151
+ # Initialize SentenceTransformer Model
152
+ # ---------------------------
153
+ model = SentenceTransformer('all-MiniLM-L6-v2')
154
+
155
def robust_min_max_norm(scores):
    """
    Min-max normalize *scores* into [0, 1].

    When the spread is (near) zero, return all ones instead of dividing
    by zero.
    """
    lo, hi = scores.min(), scores.max()
    spread = hi - lo
    if spread < 1e-10:
        return np.ones_like(scores)
    return (scores - lo) / spread
164
+
165
+ # ---------------------------
166
+ # Main Function: Repository Ranking with Hybrid Retrieval
167
+ # ---------------------------
168
def run_repository_ranking(query: str) -> str:
    """
    Rank GitHub repositories for *query* using hybrid retrieval.

    Pipeline: convert the query into search tags, run per-tag and combined
    GitHub searches, deduplicate by repo link, then score each repository
    with a weighted blend of dense (FAISS cosine) and sparse (BM25)
    similarity. Returns a formatted plain-text ranking.
    """
    # Step 1: Generate search tags from the query.
    search_tags = iterative_convert_to_search_tags(query)
    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]

    # Step 2: Handle target language extraction.
    target_lang = None
    if any(tag.startswith("target-") for tag in tag_list):
        target_tag = next(tag for tag in tag_list if tag.startswith("target-"))
        target_lang = target_tag.replace("target-", "")
        lang_query = f"language:{target_lang}"
        tag_list = [tag for tag in tag_list if not tag.startswith("target-")]
    else:
        lang_query = "language:python"

    # Step 3: Build advanced search qualifiers.
    advanced_qualifier = "in:name,description,readme"
    all_repositories = []

    # Loop over individual tags.
    for tag in tag_list:
        github_query = f"{tag} {advanced_qualifier} {lang_query}"
        print("GitHub Query:", github_query)
        repos = fetch_github_repositories(github_query, max_results=15)
        all_repositories.extend(repos)

    # Also perform a combined query using OR logic for higher recall.
    combined_query = " OR ".join(tag_list)
    combined_query = f"({combined_query}) {advanced_qualifier} {lang_query}"
    print("Combined GitHub Query:", combined_query)
    repos = fetch_github_repositories(combined_query, max_results=15)
    all_repositories.extend(repos)

    # Deduplicate repositories using the repo link, merging text from
    # repeated hits so no context is lost.
    unique_repositories = {}
    for repo in all_repositories:
        if repo["link"] not in unique_repositories:
            unique_repositories[repo["link"]] = repo
        else:
            existing_text = unique_repositories[repo["link"]]["combined_text"]
            unique_repositories[repo["link"]]["combined_text"] = existing_text + "\n" + repo["combined_text"]
    repositories = list(unique_repositories.values())

    if not repositories:
        return "No repositories found for your query."

    # Step 4: Prepare documents from the combined text (description + README).
    docs = [repo.get("combined_text", "") for repo in repositories]

    # Step 5: Compute dense embeddings and build the FAISS index.
    # Vectors are L2-normalized so inner product equals cosine similarity.
    doc_embeddings = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    if doc_embeddings.ndim == 1:
        doc_embeddings = doc_embeddings.reshape(1, -1)
    norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    norm_doc_embeddings = doc_embeddings / (norms + 1e-10)

    query_embedding = model.encode(query, convert_to_numpy=True)
    if query_embedding.ndim == 1:
        query_embedding = query_embedding.reshape(1, -1)
    norm_query_embedding = query_embedding / (np.linalg.norm(query_embedding) + 1e-10)

    dim = norm_doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(norm_doc_embeddings)
    k = norm_doc_embeddings.shape[0]
    D, I = index.search(norm_query_embedding, k)
    # BUG FIX (two issues with the old `dense_scores = D.squeeze()`):
    # 1. squeeze() yields a 0-d array when only one repository is found,
    #    which breaks `combined_scores[idx]` below; D[0] is always 1-d.
    # 2. FAISS returns scores in *ranked* order while BM25 scores are in
    #    *document* order — scatter the scores back into document order
    #    via I[0] so both score arrays align with `repositories`.
    dense_scores = np.empty_like(D[0])
    dense_scores[I[0]] = D[0]
    norm_dense_scores = robust_min_max_norm(dense_scores)

    # Step 6: Compute BM25 scores (zeros when rank_bm25 is unavailable).
    if BM25Okapi is not None:
        tokenized_docs = [re.findall(r'\w+', doc.lower()) for doc in docs]
        bm25 = BM25Okapi(tokenized_docs)
        query_tokens = re.findall(r'\w+', query.lower())
        bm25_scores = np.array(bm25.get_scores(query_tokens))
        norm_bm25_scores = robust_min_max_norm(bm25_scores)
    else:
        norm_bm25_scores = np.zeros_like(norm_dense_scores)

    # Step 7: Combine scores, weighting dense retrieval more heavily.
    alpha = 0.8  # Weight for dense retrieval
    combined_scores = alpha * norm_dense_scores + (1 - alpha) * norm_bm25_scores

    for idx, repo in enumerate(repositories):
        repo["combined_score"] = float(combined_scores[idx])

    # Step 8: Rank repositories and format output.
    ranked_repositories = sorted(repositories, key=lambda x: x.get("combined_score", 0), reverse=True)

    output = "\n=== Ranked Repositories ===\n"
    for rank, repo in enumerate(ranked_repositories, 1):
        output += f"Final Rank: {rank}\n"
        output += f"Title: {repo['title']}\n"
        output += f"Link: {repo['link']}\n"
        output += f"Combined Score: {repo.get('combined_score', 0):.4f}\n"
        snippet = repo['combined_text'][:300].replace('\n', ' ')
        output += f"Snippet: {snippet}...\n"
        output += '-' * 80 + "\n"
    output += "\n=== End of Results ==="
    return output
273
+
274
+ # ---------------------------
275
+ # Main Entry Point for Testing
276
+ # ---------------------------
277
if __name__ == "__main__":
    # Quick manual smoke test when the module is run directly.
    demo_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
    print(run_repository_ranking(demo_query))
src/__init__.py CHANGED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Package init: load the project .env and expose GITHUB_API_KEY.

BUG FIX: the demo GitHub search used to run at module top level, so
merely importing the ``src`` package (e.g. ``from src.deepgit_lite
import run_deepgit_lite``) triggered a live network request. The demo
now only runs when this file is executed directly.
"""
from pathlib import Path
from dotenv import load_dotenv
import os
import requests

# Load .env from the project root
dotenv_path = Path(__file__).resolve().parents[1] / ".env"
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path)

# Get GitHub API key from environment; fail fast so downstream modules
# don't make unauthenticated API calls.
GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")
if not GITHUB_API_KEY:
    raise ValueError("GITHUB_API_KEY not found in environment variables.")


def _demo_search():
    """Run a sample GitHub repository search and print the results."""
    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {GITHUB_API_KEY}",
        "Accept": "application/vnd.github.v3+json",
    }
    params = {
        "q": "data augmentation language:python",
        "sort": "stars",
        "order": "desc",
        "per_page": 10,  # adjust the number of results as needed
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        items = response.json().get("items", [])
        print(f"Found {len(items)} repositories:")
        for repo in items:
            print(f"- {repo['full_name']}: {repo['html_url']}")
    else:
        print(f"Error {response.status_code}: {response.json().get('message')}")


if __name__ == "__main__":
    _demo_search()
src/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (215 Bytes)
 
src/__pycache__/deepgit_lite.cpython-311.pyc DELETED
Binary file (16.7 kB)
 
src/deepgit_lite.py CHANGED
@@ -11,16 +11,22 @@ from dotenv import load_dotenv
11
  from pathlib import Path
12
  from langchain_groq import ChatGroq
13
  from langchain_core.prompts import ChatPromptTemplate
 
 
14
 
15
  # ---------------------------
16
  # Environment and .env Setup
17
  # ---------------------------
18
- dotenv_path = Path(__file__).resolve().parent.parent / ".env"
19
- load_dotenv(dotenv_path=str(dotenv_path))
 
20
 
21
  if "GITHUB_API_KEY" not in os.environ:
22
  raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
23
 
 
 
 
24
  # ---------------------------
25
  # Logging Setup
26
  # ---------------------------
@@ -28,31 +34,86 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
28
  logger = logging.getLogger(__name__)
29
 
30
  # ---------------------------
31
- # ChatGroq Integration Setup (for query enhancement and final justification)
32
  # ---------------------------
33
  llm_groq = ChatGroq(
34
- model="llama-3.1-8b-instant",
35
  temperature=0.2,
36
- max_tokens=100,
37
  timeout=15,
38
  max_retries=2
39
  )
40
 
41
- def enhance_query(original_query):
42
- prompt = f"""You are an expert research assistant. Given the query: "{original_query}",
43
- please enhance and expand it by adding relevant technical keywords, recent research context,
44
- and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment.
45
- Provide the refined query text."""
46
- messages = [
47
- ("system", "You are a helpful research assistant specializing in AI and software research."),
48
- ("human", prompt)
49
- ]
50
- result = llm_groq.invoke(messages)
51
- # Extract text content if available
52
- if hasattr(result, "content"):
53
- return result.content
54
- return str(result)
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def justify_candidate(candidate, query):
57
  prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
58
 
@@ -104,11 +165,9 @@ def fetch_directory_markdown(repo_full_name, path, headers):
104
 
105
  def fetch_repo_documentation(repo_full_name, headers):
106
  doc_text = ""
107
- # Fetch README first.
108
  readme = fetch_readme_content(repo_full_name, headers)
109
  if readme:
110
  doc_text += "# README\n" + readme
111
- # Fetch additional markdown files and documentation directories.
112
  root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
113
  response = requests.get(root_url, headers=headers)
114
  if response.status_code == 200:
@@ -165,27 +224,33 @@ def fetch_github_repositories(query, max_results=1000, per_page=100):
165
  # Main Lite Workflow Function
166
  # ---------------------------
167
  def run_deepgit_lite(user_query):
168
- # Stage 0: Query Enhancement using ChatGroq
169
- logger.info("Enhancing query using ChatGroq...")
170
  original_query = user_query.strip()
171
- enhanced_query = enhance_query(original_query)
172
- logger.info(f"Enhanced Query: {enhanced_query}")
173
- github_query = enhanced_query + " language:python"
 
 
174
  logger.info(f"Using GitHub query: {github_query}")
175
 
176
- # Stage 1: Dense Retrieval with FAISS
177
  logger.info("Fetching repositories from GitHub...")
178
  repos = fetch_github_repositories(github_query)
179
  if not repos:
180
- logger.error("No repositories found. Please refine your query.")
181
- return "\nNo repositories found for your query. Please try a different query."
182
-
 
 
 
 
 
183
  docs = [repo.get("combined_doc", "") for repo in repos]
184
  logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
185
- sem_model = SentenceTransformer("all-mpnet-base-v2")
186
  doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
187
 
188
- # Check if embeddings array is empty or 1-dimensional
189
  if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
190
  logger.error("No document embeddings generated. Aborting dense retrieval.")
191
  return "\nFailed to generate document embeddings. Please try again."
@@ -210,7 +275,7 @@ def run_deepgit_lite(user_query):
210
  # Stage 2: Filtering Low-Star Repositories
211
  filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
212
  if not filtered_candidates:
213
- filtered_candidates = ranked_by_semantic # fallback if filtering is too strict
214
  logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")
215
 
216
  # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
@@ -228,7 +293,6 @@ def run_deepgit_lite(user_query):
228
  for repo in filtered_candidates:
229
  norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
230
  norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
231
- # Weights: 60% semantic, 40% stars.
232
  repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
233
 
234
  final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
@@ -256,3 +320,8 @@ def run_deepgit_lite(user_query):
256
  result_text += "\n=== End of Results ==="
257
 
258
  return result_text
 
 
 
 
 
 
11
  from pathlib import Path
12
  from langchain_groq import ChatGroq
13
  from langchain_core.prompts import ChatPromptTemplate
14
+ import re
15
+ import getpass
16
 
17
  # ---------------------------
18
  # Environment and .env Setup
19
  # ---------------------------
20
+ dotenv_path = Path(__file__).resolve().parents[1] / ".env"
21
+ if dotenv_path.exists():
22
+ load_dotenv(dotenv_path=dotenv_path)
23
 
24
  if "GITHUB_API_KEY" not in os.environ:
25
  raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
26
 
27
+ # Optionally, silence bitsandbytes warnings if desired.
28
+ os.environ["BITSANDBYTES_NOWARN"] = "1"
29
+
30
  # ---------------------------
31
  # Logging Setup
32
  # ---------------------------
 
34
  logger = logging.getLogger(__name__)
35
 
36
  # ---------------------------
37
+ # ChatGroq Integration Setup (for query conversion and final justification)
38
  # ---------------------------
39
  llm_groq = ChatGroq(
40
+ model="deepseek-r1-distill-llama-70b",
41
  temperature=0.2,
42
+ max_tokens=800,
43
  timeout=15,
44
  max_retries=2
45
  )
46
 
47
+ # --- Query Conversion Functions ---
48
+ prompt = ChatPromptTemplate.from_messages([
49
+ ("system",
50
+ """You are a GitHub search optimization expert.
 
 
 
 
 
 
 
 
 
 
51
 
52
+ Your job is to:
53
+ 1. Read a user's query about tools, research, or tasks.
54
+ 2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
55
+ 3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
56
+ 4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
57
+ Use as many tags as necessary based on the query's complexity, but never more than five.
58
+ 5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
59
+ If no specific language is mentioned, do not include any target tag.
60
+
61
+ Output Format:
62
+ tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
63
+
64
+ Rules:
65
+ - Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
66
+ - Use terms commonly found in GitHub repo names, topics, or descriptions.
67
+ - Avoid generic terms like "python", "ai", "tool", "project".
68
+ - Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
69
+ - Prefer real tools, popular methods, or dataset names when mentioned.
70
+ - If your output does not strictly match the required format, correct it after your internal reasoning.
71
+ - Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
72
+
73
+ Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
74
+ """),
75
+ ("human", "{query}")
76
+ ])
77
+ chain = prompt | llm_groq
78
+
79
+ def parse_search_tags(response: str) -> str:
80
+ """
81
+ Removes any internal commentary enclosed in <think> ... </think> tags using regex,
82
+ and returns only the final searchable tags.
83
+ """
84
+ cleaned = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
85
+ return cleaned
86
+
87
+ def valid_tags(tags: str) -> bool:
88
+ """
89
+ Validates that the output is one to six colon-separated tokens composed of lowercase letters, numbers, and hyphens.
90
+ """
91
+ pattern = r'^[a-z0-9-]+(?::[a-z0-9-]+){0,5}$'
92
+ return re.match(pattern, tags) is not None
93
+
94
+ def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
95
+ print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
96
+ refined_query = query
97
+ tags_output = ""
98
+ for iteration in range(max_iterations):
99
+ print(f"\n🔄 Iteration {iteration+1}")
100
+ response = chain.invoke({"query": refined_query})
101
+ full_output = response.content.strip()
102
+ tags_output = parse_search_tags(full_output)
103
+ print(f"Output Tags: {tags_output}")
104
+ if valid_tags(tags_output):
105
+ print("✅ Valid tags format detected.")
106
+ return tags_output
107
+ else:
108
+ print("⚠️ Invalid tags format. Requesting refinement...")
109
+ refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
110
+ print("Final output (may be invalid):", tags_output)
111
+ # Fallback default tags if output is still invalid
112
+ fallback = "data-augmentation:llm-fine-tuning"
113
+ print(f"Using fallback search tags: {fallback}")
114
+ return fallback
115
+
116
+ # --- Justification Function ---
117
  def justify_candidate(candidate, query):
118
  prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
119
 
 
165
 
166
  def fetch_repo_documentation(repo_full_name, headers):
167
  doc_text = ""
 
168
  readme = fetch_readme_content(repo_full_name, headers)
169
  if readme:
170
  doc_text += "# README\n" + readme
 
171
  root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
172
  response = requests.get(root_url, headers=headers)
173
  if response.status_code == 200:
 
224
  # Main Lite Workflow Function
225
  # ---------------------------
226
  def run_deepgit_lite(user_query):
227
+ # Stage 0: Query Conversion using iterative_convert_to_search_tags
228
+ logger.info("Converting query to searchable tags...")
229
  original_query = user_query.strip()
230
+ search_tags = iterative_convert_to_search_tags(original_query)
231
+ logger.info(f"Search Tags: {search_tags}")
232
+ # Convert colon-separated tags into a space-separated query string.
233
+ tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
234
+ github_query = " ".join(tag_list) + " language:python"
235
  logger.info(f"Using GitHub query: {github_query}")
236
 
237
+ # Stage 1: Dense Retrieval with FAISS - Fetch repositories using the query.
238
  logger.info("Fetching repositories from GitHub...")
239
  repos = fetch_github_repositories(github_query)
240
  if not repos:
241
+ logger.warning("No repositories found with converted query. Falling back to default query.")
242
+ fallback_query = "data augmentation language:python"
243
+ logger.info(f"Using fallback GitHub query: {fallback_query}")
244
+ repos = fetch_github_repositories(fallback_query)
245
+ if not repos:
246
+ logger.error("No repositories found with fallback query either.")
247
+ return "\nNo repositories found for your query. Please try a different query."
248
+
249
  docs = [repo.get("combined_doc", "") for repo in repos]
250
  logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
251
+ sem_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
252
  doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
253
 
 
254
  if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
255
  logger.error("No document embeddings generated. Aborting dense retrieval.")
256
  return "\nFailed to generate document embeddings. Please try again."
 
275
  # Stage 2: Filtering Low-Star Repositories
276
  filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
277
  if not filtered_candidates:
278
+ filtered_candidates = ranked_by_semantic
279
  logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")
280
 
281
  # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
 
293
  for repo in filtered_candidates:
294
  norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
295
  norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
 
296
  repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
297
 
298
  final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
 
320
  result_text += "\n=== End of Results ==="
321
 
322
  return result_text
323
+
324
+ # For debugging: if run directly, execute with an example query.
325
+ if __name__ == "__main__":
326
+ test_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
327
+ print(run_deepgit_lite(test_query))