zamalali committed on
Commit
777083e
·
1 Parent(s): 1796763

Refactor DeepGit Lite to load environment variables, update API integration, and enhance user feedback

Browse files
__pycache__/main.cpython-311.pyc ADDED
Binary file (15.7 kB). View file
 
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import time
3
  import threading
4
  import logging
5
- from src.deepgit_lite import run_deepgit_lite
6
 
7
  # ---------------------------
8
  # Global Logging Buffer Setup
@@ -59,7 +59,7 @@ title = """
59
  """
60
 
61
  description = """<p align="center">
62
- DeepGit Lite is a streamlined version of DeepGit designed for fast semantic search on GitHub repositories. It enhances your query, retrieves repositories using dense retrieval via FAISS, filters by star count, combines scores based on semantic similarity and popularity, and then provides a concise justification for the top results.
63
  </p>"""
64
 
65
  consent_text = """
@@ -81,7 +81,7 @@ footer = """
81
  """
82
 
83
  # ---------------------------
84
- # HTML Table Renderer for DeepGit Lite
85
  # ---------------------------
86
  def format_percent(value):
87
  try:
@@ -116,8 +116,7 @@ def parse_result_to_html(raw_result: str) -> str:
116
  <th>Rank</th>
117
  <th>Title</th>
118
  <th>Link</th>
119
- <th>Semantic Similarity</th>
120
- <th>Final Score</th>
121
  </tr>
122
  </thead>
123
  <tbody>
@@ -135,18 +134,17 @@ def parse_result_to_html(raw_result: str) -> str:
135
  <td>{data.get('Final Rank', '')}</td>
136
  <td>{data.get('Title', '')}</td>
137
  <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
138
- <td>{format_percent(data.get('Semantic Similarity', ''))}</td>
139
- <td>{format_percent(data.get('Final Score', ''))}</td>
140
  </tr>
141
  """
142
  html += "</tbody></table>"
143
  return html
144
 
145
  # ---------------------------
146
- # Background Workflow Runner for DeepGit Lite
147
  # ---------------------------
148
  def run_lite_workflow(topic, result_container):
149
- result = run_deepgit_lite(topic)
150
  result_container["raw_result"] = result
151
 
152
  def stream_lite_workflow(topic):
@@ -180,7 +178,6 @@ def stream_lite_workflow(topic):
180
  # App UI Setup for DeepGit Lite
181
  # ---------------------------
182
  with gr.Blocks(
183
- theme="gstaff/sketch",
184
  css="""
185
  #main_container { margin: auto; max-width: 900px; }
186
  footer, footer * { display: none !important; }
@@ -198,7 +195,7 @@ with gr.Blocks(
198
  with gr.Column(elem_id="main_container", visible=False) as main_block:
199
  research_input = gr.Textbox(
200
  label="Research Topic",
201
- placeholder="Enter your research topic here, e.g., 'Instruction-based fine-tuning for LLaMA 2 using chain-of-thought prompting in Python.'",
202
  lines=3
203
  )
204
  run_button = gr.Button("Run DeepGit Lite", variant="primary")
@@ -212,7 +209,9 @@ with gr.Blocks(
212
 
213
  agree_button.click(fn=enable_main, inputs=[], outputs=[consent_block, main_block], queue=False)
214
 
 
215
  def lite_runner(topic):
 
216
  for status, details in stream_lite_workflow(topic):
217
  yield status, details
218
 
 
2
  import time
3
  import threading
4
  import logging
5
+ from main import run_repository_ranking # Import the new function from main.py
6
 
7
  # ---------------------------
8
  # Global Logging Buffer Setup
 
59
  """
60
 
61
  description = """<p align="center">
62
+ DeepGit Lite is a streamlined tool for semantic search on GitHub repositories. It retrieves repositories using dense retrieval, ranks them by similarity, and then presents the top results.
63
  </p>"""
64
 
65
  consent_text = """
 
81
  """
82
 
83
  # ---------------------------
84
+ # HTML Table Renderer for Results
85
  # ---------------------------
86
  def format_percent(value):
87
  try:
 
116
  <th>Rank</th>
117
  <th>Title</th>
118
  <th>Link</th>
119
+ <th>Combined Score</th>
 
120
  </tr>
121
  </thead>
122
  <tbody>
 
134
  <td>{data.get('Final Rank', '')}</td>
135
  <td>{data.get('Title', '')}</td>
136
  <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
137
+ <td>{data.get('Combined Score', '')}</td>
 
138
  </tr>
139
  """
140
  html += "</tbody></table>"
141
  return html
142
 
143
  # ---------------------------
144
+ # Background Workflow Runner
145
  # ---------------------------
146
  def run_lite_workflow(topic, result_container):
147
+ result = run_repository_ranking(topic)
148
  result_container["raw_result"] = result
149
 
150
  def stream_lite_workflow(topic):
 
178
  # App UI Setup for DeepGit Lite
179
  # ---------------------------
180
  with gr.Blocks(
 
181
  css="""
182
  #main_container { margin: auto; max-width: 900px; }
183
  footer, footer * { display: none !important; }
 
195
  with gr.Column(elem_id="main_container", visible=False) as main_block:
196
  research_input = gr.Textbox(
197
  label="Research Topic",
198
+ placeholder="Enter your research topic here, e.g., 'Fine tuning Instruction tuned LLama models...'",
199
  lines=3
200
  )
201
  run_button = gr.Button("Run DeepGit Lite", variant="primary")
 
209
 
210
  agree_button.click(fn=enable_main, inputs=[], outputs=[consent_block, main_block], queue=False)
211
 
212
+ # Added initial yield for immediate feedback when the button is pressed.
213
  def lite_runner(topic):
214
+ yield "Workflow started", "<p>Processing your request. Please wait...</p>"
215
  for status, details in stream_lite_workflow(topic):
216
  yield status, details
217
 
main.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import requests
4
+ import numpy as np
5
+ import faiss
6
+ import re
7
+ from sentence_transformers import SentenceTransformer
8
+ from dotenv import load_dotenv
9
+ from pathlib import Path
10
+ from langchain_groq import ChatGroq
11
+ from langchain_core.prompts import ChatPromptTemplate
12
+
13
+ # Optionally import BM25 for sparse retrieval.
14
+ try:
15
+ from rank_bm25 import BM25Okapi
16
+ except ImportError:
17
+ BM25Okapi = None
18
+
19
+ # ---------------------------
20
+ # Environment Setup
21
+ # ---------------------------
22
+ load_dotenv()
23
+
24
+ # Setup a persistent session for GitHub API requests
25
+ session = requests.Session()
26
+ session.headers.update({
27
+ "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
28
+ "Accept": "application/vnd.github.v3+json"
29
+ })
30
+
31
+ # ---------------------------
32
+ # Langchain Groq Setup
33
+ # ---------------------------
34
+ llm = ChatGroq(
35
+ model="deepseek-r1-distill-llama-70b",
36
+ temperature=0.3,
37
+ max_tokens=512,
38
+ max_retries=3,
39
+ )
40
+ prompt = ChatPromptTemplate.from_messages([
41
+ ("system",
42
+ """You are a GitHub search optimization expert.
43
+
44
+ Your job is to:
45
+ 1. Read a user's query about tools, research, or tasks.
46
+ 2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
47
+ 3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
48
+ 4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
49
+ Use as many tags as necessary based on the query's complexity, but never more than five.
50
+ 5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
51
+ If no specific language is mentioned, do not include any target tag.
52
+
53
+ Output Format:
54
+ tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
55
+
56
+ Rules:
57
+ - Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
58
+ - Use terms commonly found in GitHub repo names, topics, or descriptions.
59
+ - Avoid generic terms like "python", "ai", "tool", "project".
60
+ - Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
61
+ - Prefer real tools, popular methods, or dataset names when mentioned.
62
+ - If your output does not strictly match the required format, correct it after your internal reasoning.
63
+ - Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
64
+
65
+ Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
66
+ """),
67
+ ("human", "{query}")
68
+ ])
69
+ chain = prompt | llm
70
+
71
+ def parse_search_tags(response) -> str:
72
+ """
73
+ Removes internal chain-of-thought (enclosed in <think> tags) and returns only the final search tags.
74
+ """
75
+ response_str = str(response)
76
+ if "<think>" in response_str and "</think>" in response_str:
77
+ end_index = response_str.index("</think>") + len("</think>")
78
+ tags = response_str[end_index:].strip()
79
+ return tags
80
+ else:
81
+ return response_str.strip()
82
+
83
def valid_tags(tags: str) -> bool:
    """
    Validate that *tags* is one to six colon-separated tokens made of
    lowercase letters, digits, and hyphens.
    """
    token = r'[a-z0-9-]+'
    return bool(re.match(rf'^{token}(?::{token}){{0,5}}$', tags))
89
+
90
def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
    """
    Ask the LLM chain to turn *query* into colon-separated GitHub search
    tags, retrying (up to *max_iterations* times) with a corrective prompt
    whenever the output does not match the required tag format.

    Returns the last tags string produced, even if it is still invalid.
    """
    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
    current_query = query
    tags_output = ""
    for attempt in range(1, max_iterations + 1):
        print(f"\n🔄 Iteration {attempt}")
        reply = chain.invoke({"query": current_query})
        tags_output = parse_search_tags(reply.content.strip())
        print(f"Output Tags: {tags_output}")
        if valid_tags(tags_output):
            print("✅ Valid tags format detected.")
            return tags_output
        print("⚠️ Invalid tags format. Requesting refinement...")
        # Re-ask with an explicit format reminder appended to the query.
        current_query = (
            f"{query}\nPlease refine your answer so that the output strictly "
            "matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
        )
    print("Final output (may be invalid):", tags_output)
    return tags_output
108
+
109
+ # ---------------------------
110
+ # GitHub API Helper Functions
111
+ # ---------------------------
112
def fetch_readme_content(repo_full_name):
    """
    Fetch and decode the README of *repo_full_name* via the GitHub API.

    Returns the README text, or "" on any HTTP or decoding failure.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/readme"
    resp = session.get(url)
    if resp.status_code != 200:
        return ""
    payload = resp.json()
    try:
        # GitHub serves README content base64-encoded in the JSON body.
        raw = base64.b64decode(payload.get('content', ''))
        return raw.decode('utf-8', errors='replace')
    except Exception:
        return ""
122
+
123
def fetch_github_repositories(query, max_results=10):
    """
    Search GitHub repositories matching *query* and collect key fields.

    Each result dict carries the repo title, HTML link, and a combined
    description+README text. Returns [] on API errors.
    """
    resp = session.get(
        "https://api.github.com/search/repositories",
        params={"q": query, "per_page": max_results},
    )
    if resp.status_code != 200:
        print(f"Error {resp.status_code}: {resp.json().get('message')}")
        return []
    results = []
    for item in resp.json().get('items', []):
        description = item.get('description') or ""
        readme_text = fetch_readme_content(item.get('full_name'))
        # Merge description and README so ranking sees richer context.
        results.append({
            "title": item.get('name', 'No title available'),
            "link": item.get('html_url'),
            "combined_text": (description + "\n" + readme_text).strip(),
        })
    return results
149
+
150
+ # ---------------------------
151
+ # Initialize SentenceTransformer Model
152
+ # ---------------------------
153
+ model = SentenceTransformer('all-MiniLM-L6-v2')
154
+
155
def robust_min_max_norm(scores):
    """
    Min-max normalize *scores* into [0, 1].

    When the spread is (near) zero, return all ones instead of dividing
    by zero.
    """
    lo, hi = scores.min(), scores.max()
    spread = hi - lo
    if spread < 1e-10:
        return np.ones_like(scores)
    return (scores - lo) / spread
164
+
165
+ # ---------------------------
166
+ # Main Function: Repository Ranking with Hybrid Retrieval
167
+ # ---------------------------
168
def run_repository_ranking(query: str) -> str:
    """
    Rank GitHub repositories for *query* using hybrid retrieval.

    Pipeline: convert the query into search tags, run per-tag and combined
    GitHub searches, deduplicate by repo link, then score each repository
    with a weighted blend of dense (FAISS cosine) and sparse (BM25)
    similarity. Returns a formatted plain-text ranking.
    """
    # Step 1: Generate search tags from the query.
    search_tags = iterative_convert_to_search_tags(query)
    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]

    # Step 2: Handle target language extraction.
    target_lang = None
    if any(tag.startswith("target-") for tag in tag_list):
        target_tag = next(tag for tag in tag_list if tag.startswith("target-"))
        target_lang = target_tag.replace("target-", "")
        lang_query = f"language:{target_lang}"
        tag_list = [tag for tag in tag_list if not tag.startswith("target-")]
    else:
        lang_query = "language:python"

    # Step 3: Build advanced search qualifiers.
    advanced_qualifier = "in:name,description,readme"
    all_repositories = []

    # Loop over individual tags.
    for tag in tag_list:
        github_query = f"{tag} {advanced_qualifier} {lang_query}"
        print("GitHub Query:", github_query)
        repos = fetch_github_repositories(github_query, max_results=15)
        all_repositories.extend(repos)

    # Also perform a combined query using OR logic for higher recall.
    combined_query = " OR ".join(tag_list)
    combined_query = f"({combined_query}) {advanced_qualifier} {lang_query}"
    print("Combined GitHub Query:", combined_query)
    repos = fetch_github_repositories(combined_query, max_results=15)
    all_repositories.extend(repos)

    # Deduplicate repositories using the repo link, merging text from
    # repeated hits so no context is lost.
    unique_repositories = {}
    for repo in all_repositories:
        if repo["link"] not in unique_repositories:
            unique_repositories[repo["link"]] = repo
        else:
            existing_text = unique_repositories[repo["link"]]["combined_text"]
            unique_repositories[repo["link"]]["combined_text"] = existing_text + "\n" + repo["combined_text"]
    repositories = list(unique_repositories.values())

    if not repositories:
        return "No repositories found for your query."

    # Step 4: Prepare documents from the combined text (description + README).
    docs = [repo.get("combined_text", "") for repo in repositories]

    # Step 5: Compute dense embeddings and build the FAISS index.
    # Vectors are L2-normalized so inner product equals cosine similarity.
    doc_embeddings = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    if doc_embeddings.ndim == 1:
        doc_embeddings = doc_embeddings.reshape(1, -1)
    norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    norm_doc_embeddings = doc_embeddings / (norms + 1e-10)

    query_embedding = model.encode(query, convert_to_numpy=True)
    if query_embedding.ndim == 1:
        query_embedding = query_embedding.reshape(1, -1)
    norm_query_embedding = query_embedding / (np.linalg.norm(query_embedding) + 1e-10)

    dim = norm_doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(norm_doc_embeddings)
    k = norm_doc_embeddings.shape[0]
    D, I = index.search(norm_query_embedding, k)
    # BUG FIX (two issues with the old `dense_scores = D.squeeze()`):
    # 1. squeeze() yields a 0-d array when only one repository is found,
    #    which breaks `combined_scores[idx]` below; D[0] is always 1-d.
    # 2. FAISS returns scores in *ranked* order while BM25 scores are in
    #    *document* order — scatter the scores back into document order
    #    via I[0] so both score arrays align with `repositories`.
    dense_scores = np.empty_like(D[0])
    dense_scores[I[0]] = D[0]
    norm_dense_scores = robust_min_max_norm(dense_scores)

    # Step 6: Compute BM25 scores (zeros when rank_bm25 is unavailable).
    if BM25Okapi is not None:
        tokenized_docs = [re.findall(r'\w+', doc.lower()) for doc in docs]
        bm25 = BM25Okapi(tokenized_docs)
        query_tokens = re.findall(r'\w+', query.lower())
        bm25_scores = np.array(bm25.get_scores(query_tokens))
        norm_bm25_scores = robust_min_max_norm(bm25_scores)
    else:
        norm_bm25_scores = np.zeros_like(norm_dense_scores)

    # Step 7: Combine scores, weighting dense retrieval more heavily.
    alpha = 0.8  # Weight for dense retrieval
    combined_scores = alpha * norm_dense_scores + (1 - alpha) * norm_bm25_scores

    for idx, repo in enumerate(repositories):
        repo["combined_score"] = float(combined_scores[idx])

    # Step 8: Rank repositories and format output.
    ranked_repositories = sorted(repositories, key=lambda x: x.get("combined_score", 0), reverse=True)

    output = "\n=== Ranked Repositories ===\n"
    for rank, repo in enumerate(ranked_repositories, 1):
        output += f"Final Rank: {rank}\n"
        output += f"Title: {repo['title']}\n"
        output += f"Link: {repo['link']}\n"
        output += f"Combined Score: {repo.get('combined_score', 0):.4f}\n"
        snippet = repo['combined_text'][:300].replace('\n', ' ')
        output += f"Snippet: {snippet}...\n"
        output += '-' * 80 + "\n"
    output += "\n=== End of Results ==="
    return output
273
+
274
+ # ---------------------------
275
+ # Main Entry Point for Testing
276
+ # ---------------------------
277
if __name__ == "__main__":
    # Quick manual smoke test when the module is run directly.
    demo_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
    print(run_repository_ranking(demo_query))
src/__init__.py CHANGED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Package init: load the project .env and expose GITHUB_API_KEY.

BUG FIX: the demo GitHub search used to run at module top level, so
merely importing the ``src`` package (e.g. ``from src.deepgit_lite
import run_deepgit_lite``) triggered a live network request. The demo
now only runs when this file is executed directly.
"""
from pathlib import Path
from dotenv import load_dotenv
import os
import requests

# Load .env from the project root
dotenv_path = Path(__file__).resolve().parents[1] / ".env"
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path)

# Get GitHub API key from environment; fail fast so downstream modules
# don't make unauthenticated API calls.
GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")
if not GITHUB_API_KEY:
    raise ValueError("GITHUB_API_KEY not found in environment variables.")


def _demo_search():
    """Run a sample GitHub repository search and print the results."""
    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {GITHUB_API_KEY}",
        "Accept": "application/vnd.github.v3+json",
    }
    params = {
        "q": "data augmentation language:python",
        "sort": "stars",
        "order": "desc",
        "per_page": 10,  # adjust the number of results as needed
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        items = response.json().get("items", [])
        print(f"Found {len(items)} repositories:")
        for repo in items:
            print(f"- {repo['full_name']}: {repo['html_url']}")
    else:
        print(f"Error {response.status_code}: {response.json().get('message')}")


if __name__ == "__main__":
    _demo_search()
src/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (215 Bytes)
 
src/__pycache__/deepgit_lite.cpython-311.pyc DELETED
Binary file (16.7 kB)
 
src/deepgit_lite.py CHANGED
@@ -11,16 +11,22 @@ from dotenv import load_dotenv
11
  from pathlib import Path
12
  from langchain_groq import ChatGroq
13
  from langchain_core.prompts import ChatPromptTemplate
 
 
14
 
15
  # ---------------------------
16
  # Environment and .env Setup
17
  # ---------------------------
18
- dotenv_path = Path(__file__).resolve().parent.parent / ".env"
19
- load_dotenv(dotenv_path=str(dotenv_path))
 
20
 
21
  if "GITHUB_API_KEY" not in os.environ:
22
  raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
23
 
 
 
 
24
  # ---------------------------
25
  # Logging Setup
26
  # ---------------------------
@@ -28,31 +34,86 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
28
  logger = logging.getLogger(__name__)
29
 
30
  # ---------------------------
31
- # ChatGroq Integration Setup (for query enhancement and final justification)
32
  # ---------------------------
33
  llm_groq = ChatGroq(
34
- model="llama-3.1-8b-instant",
35
  temperature=0.2,
36
- max_tokens=100,
37
  timeout=15,
38
  max_retries=2
39
  )
40
 
41
- def enhance_query(original_query):
42
- prompt = f"""You are an expert research assistant. Given the query: "{original_query}",
43
- please enhance and expand it by adding relevant technical keywords, recent research context,
44
- and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment.
45
- Provide the refined query text."""
46
- messages = [
47
- ("system", "You are a helpful research assistant specializing in AI and software research."),
48
- ("human", prompt)
49
- ]
50
- result = llm_groq.invoke(messages)
51
- # Extract text content if available
52
- if hasattr(result, "content"):
53
- return result.content
54
- return str(result)
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def justify_candidate(candidate, query):
57
  prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
58
 
@@ -104,11 +165,9 @@ def fetch_directory_markdown(repo_full_name, path, headers):
104
 
105
  def fetch_repo_documentation(repo_full_name, headers):
106
  doc_text = ""
107
- # Fetch README first.
108
  readme = fetch_readme_content(repo_full_name, headers)
109
  if readme:
110
  doc_text += "# README\n" + readme
111
- # Fetch additional markdown files and documentation directories.
112
  root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
113
  response = requests.get(root_url, headers=headers)
114
  if response.status_code == 200:
@@ -165,27 +224,33 @@ def fetch_github_repositories(query, max_results=1000, per_page=100):
165
  # Main Lite Workflow Function
166
  # ---------------------------
167
  def run_deepgit_lite(user_query):
168
- # Stage 0: Query Enhancement using ChatGroq
169
- logger.info("Enhancing query using ChatGroq...")
170
  original_query = user_query.strip()
171
- enhanced_query = enhance_query(original_query)
172
- logger.info(f"Enhanced Query: {enhanced_query}")
173
- github_query = enhanced_query + " language:python"
 
 
174
  logger.info(f"Using GitHub query: {github_query}")
175
 
176
- # Stage 1: Dense Retrieval with FAISS
177
  logger.info("Fetching repositories from GitHub...")
178
  repos = fetch_github_repositories(github_query)
179
  if not repos:
180
- logger.error("No repositories found. Please refine your query.")
181
- return "\nNo repositories found for your query. Please try a different query."
182
-
 
 
 
 
 
183
  docs = [repo.get("combined_doc", "") for repo in repos]
184
  logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
185
- sem_model = SentenceTransformer("all-mpnet-base-v2")
186
  doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
187
 
188
- # Check if embeddings array is empty or 1-dimensional
189
  if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
190
  logger.error("No document embeddings generated. Aborting dense retrieval.")
191
  return "\nFailed to generate document embeddings. Please try again."
@@ -210,7 +275,7 @@ def run_deepgit_lite(user_query):
210
  # Stage 2: Filtering Low-Star Repositories
211
  filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
212
  if not filtered_candidates:
213
- filtered_candidates = ranked_by_semantic # fallback if filtering is too strict
214
  logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")
215
 
216
  # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
@@ -228,7 +293,6 @@ def run_deepgit_lite(user_query):
228
  for repo in filtered_candidates:
229
  norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
230
  norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
231
- # Weights: 60% semantic, 40% stars.
232
  repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
233
 
234
  final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
@@ -256,3 +320,8 @@ def run_deepgit_lite(user_query):
256
  result_text += "\n=== End of Results ==="
257
 
258
  return result_text
 
 
 
 
 
 
11
  from pathlib import Path
12
  from langchain_groq import ChatGroq
13
  from langchain_core.prompts import ChatPromptTemplate
14
+ import re
15
+ import getpass
16
 
17
  # ---------------------------
18
  # Environment and .env Setup
19
  # ---------------------------
20
+ dotenv_path = Path(__file__).resolve().parents[1] / ".env"
21
+ if dotenv_path.exists():
22
+ load_dotenv(dotenv_path=dotenv_path)
23
 
24
  if "GITHUB_API_KEY" not in os.environ:
25
  raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")
26
 
27
+ # Optionally, silence bitsandbytes warnings if desired.
28
+ os.environ["BITSANDBYTES_NOWARN"] = "1"
29
+
30
  # ---------------------------
31
  # Logging Setup
32
  # ---------------------------
 
34
  logger = logging.getLogger(__name__)
35
 
36
  # ---------------------------
37
+ # ChatGroq Integration Setup (for query conversion and final justification)
38
  # ---------------------------
39
  llm_groq = ChatGroq(
40
+ model="deepseek-r1-distill-llama-70b",
41
  temperature=0.2,
42
+ max_tokens=800,
43
  timeout=15,
44
  max_retries=2
45
  )
46
 
47
+ # --- Query Conversion Functions ---
48
+ prompt = ChatPromptTemplate.from_messages([
49
+ ("system",
50
+ """You are a GitHub search optimization expert.
 
 
 
 
 
 
 
 
 
 
51
 
52
+ Your job is to:
53
+ 1. Read a user's query about tools, research, or tasks.
54
+ 2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
55
+ 3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
56
+ 4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
57
+ Use as many tags as necessary based on the query's complexity, but never more than five.
58
+ 5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
59
+ If no specific language is mentioned, do not include any target tag.
60
+
61
+ Output Format:
62
+ tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
63
+
64
+ Rules:
65
+ - Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
66
+ - Use terms commonly found in GitHub repo names, topics, or descriptions.
67
+ - Avoid generic terms like "python", "ai", "tool", "project".
68
+ - Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
69
+ - Prefer real tools, popular methods, or dataset names when mentioned.
70
+ - If your output does not strictly match the required format, correct it after your internal reasoning.
71
+ - Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
72
+
73
+ Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
74
+ """),
75
+ ("human", "{query}")
76
+ ])
77
+ chain = prompt | llm_groq
78
+
79
+ def parse_search_tags(response: str) -> str:
80
+ """
81
+ Removes any internal commentary enclosed in <think> ... </think> tags using regex,
82
+ and returns only the final searchable tags.
83
+ """
84
+ cleaned = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
85
+ return cleaned
86
+
87
+ def valid_tags(tags: str) -> bool:
88
+ """
89
+ Validates that the output is one to six colon-separated tokens composed of lowercase letters, numbers, and hyphens.
90
+ """
91
+ pattern = r'^[a-z0-9-]+(?::[a-z0-9-]+){0,5}$'
92
+ return re.match(pattern, tags) is not None
93
+
94
+ def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
95
+ print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
96
+ refined_query = query
97
+ tags_output = ""
98
+ for iteration in range(max_iterations):
99
+ print(f"\n🔄 Iteration {iteration+1}")
100
+ response = chain.invoke({"query": refined_query})
101
+ full_output = response.content.strip()
102
+ tags_output = parse_search_tags(full_output)
103
+ print(f"Output Tags: {tags_output}")
104
+ if valid_tags(tags_output):
105
+ print("✅ Valid tags format detected.")
106
+ return tags_output
107
+ else:
108
+ print("⚠️ Invalid tags format. Requesting refinement...")
109
+ refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
110
+ print("Final output (may be invalid):", tags_output)
111
+ # Fallback default tags if output is still invalid
112
+ fallback = "data-augmentation:llm-fine-tuning"
113
+ print(f"Using fallback search tags: {fallback}")
114
+ return fallback
115
+
116
+ # --- Justification Function ---
117
  def justify_candidate(candidate, query):
118
  prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
119
 
 
165
 
166
  def fetch_repo_documentation(repo_full_name, headers):
167
  doc_text = ""
 
168
  readme = fetch_readme_content(repo_full_name, headers)
169
  if readme:
170
  doc_text += "# README\n" + readme
 
171
  root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
172
  response = requests.get(root_url, headers=headers)
173
  if response.status_code == 200:
 
224
  # Main Lite Workflow Function
225
  # ---------------------------
226
  def run_deepgit_lite(user_query):
227
+ # Stage 0: Query Conversion using iterative_convert_to_search_tags
228
+ logger.info("Converting query to searchable tags...")
229
  original_query = user_query.strip()
230
+ search_tags = iterative_convert_to_search_tags(original_query)
231
+ logger.info(f"Search Tags: {search_tags}")
232
+ # Convert colon-separated tags into a space-separated query string.
233
+ tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
234
+ github_query = " ".join(tag_list) + " language:python"
235
  logger.info(f"Using GitHub query: {github_query}")
236
 
237
+ # Stage 1: Dense Retrieval with FAISS - Fetch repositories using the query.
238
  logger.info("Fetching repositories from GitHub...")
239
  repos = fetch_github_repositories(github_query)
240
  if not repos:
241
+ logger.warning("No repositories found with converted query. Falling back to default query.")
242
+ fallback_query = "data augmentation language:python"
243
+ logger.info(f"Using fallback GitHub query: {fallback_query}")
244
+ repos = fetch_github_repositories(fallback_query)
245
+ if not repos:
246
+ logger.error("No repositories found with fallback query either.")
247
+ return "\nNo repositories found for your query. Please try a different query."
248
+
249
  docs = [repo.get("combined_doc", "") for repo in repos]
250
  logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
251
+ sem_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
252
  doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
253
 
 
254
  if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
255
  logger.error("No document embeddings generated. Aborting dense retrieval.")
256
  return "\nFailed to generate document embeddings. Please try again."
 
275
  # Stage 2: Filtering Low-Star Repositories
276
  filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
277
  if not filtered_candidates:
278
+ filtered_candidates = ranked_by_semantic
279
  logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")
280
 
281
  # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
 
293
  for repo in filtered_candidates:
294
  norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
295
  norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
 
296
  repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
297
 
298
  final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
 
320
  result_text += "\n=== End of Results ==="
321
 
322
  return result_text
323
+
324
+ # For debugging: if run directly, execute with an example query.
325
+ if __name__ == "__main__":
326
+ test_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
327
+ print(run_deepgit_lite(test_query))