Spaces:

mikemoz
/

gitllm

Sleeping

App Files Files Community

heaversm committed on Jul 17, 2024

Commit

449cbf5

1 Parent(s): a107c82

initial commit - command line only.

Browse files

Files changed (13) hide show

.gitignore +2 -1
data/db/chroma.sqlite3 +0 -0
lib/chain.py +58 -0
lib/entities.py +10 -0
lib/loader.py +18 -0
lib/models.py +44 -0
lib/repository.py +11 -0
lib/utils.py +46 -0
main.py +68 -0
prompt_templates/README.md +4 -0
prompt_templates/evaluation_prompt.txt +4 -0
prompt_templates/initial_prompt.txt +4 -0
requirements.txt +3 -1

.gitignore CHANGED Viewed

@@ -1,2 +1,3 @@
 .env
-.venv

 .env
+.venv
+**/__pycache__/*

data/db/chroma.sqlite3 ADDED Viewed

Binary file (156 kB). View file

lib/chain.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import os
+from operator import itemgetter
+from langchain_chroma import Chroma
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.runnables import RunnablePassthrough, RunnableParallel
+from langchain_core.output_parsers import JsonOutputParser
+from langchain.prompts import PromptTemplate
+from lib.models import MODELS_MAP
+from lib.utils import format_docs, retrieve_answer, load_embeddings
+from lib.entities import LLMEvalResult
+def create_retriever(llm_name, db_path, docs, collection_name="local-rag"):
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=60)
+    splits = text_splitter.split_documents(docs)
+    embeddings = load_embeddings(llm_name)
+    if not os.path.exists(db_path):
+        vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=db_path, collection_name=collection_name)
+    else:
+        vectorstore = Chroma(persist_directory=db_path, embedding_function=embeddings, collection_name=collection_name)
+    retriever = vectorstore.as_retriever()
+    return retriever
+def create_qa_chain(llm, retriever, prompts_text):
+    initial_prompt_text = prompts_text["initial_prompt"]
+    qa_eval_prompt_text = prompts_text["evaluation_prompt"]
+    initial_prompt = PromptTemplate(
+        template=initial_prompt_text,
+        input_variables=["question", "context"]
+    )
+    json_parser = JsonOutputParser(pydantic_object=LLMEvalResult)
+    qa_eval_prompt = PromptTemplate(
+        template=qa_eval_prompt_text,
+        input_variables=["question","answer"],
+        partial_variables={"format_instructions": json_parser.get_format_instructions()},
+    )
+    qa_eval_prompt_with_context = PromptTemplate(
+        template=qa_eval_prompt_text,
+        input_variables=["question","answer","context"],
+        partial_variables={"format_instructions": json_parser.get_format_instructions()},
+    )
+    chain = (
+        RunnableParallel(context = retriever | format_docs, question = RunnablePassthrough()) |
+        RunnableParallel(answer = initial_prompt | llm | retrieve_answer, question = itemgetter("question"), context = itemgetter("context") ) |
+        RunnableParallel(input =  qa_eval_prompt, context = itemgetter("context"), answer = itemgetter("answer")) |
+        RunnableParallel(evaluation = itemgetter("input") | llm , context = itemgetter("context"), answer = itemgetter("answer") ) |
+        RunnableParallel(output = itemgetter("answer"), evaluation = itemgetter("evaluation") | json_parser,  context = itemgetter("context"))
+    )
+    return chain

lib/entities.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from enum import Enum
+from langchain_core.pydantic_v1 import BaseModel, Field
+class AccuracyEnum(str, Enum):  # the two verdict labels an evaluation may carry
+    accurate = "accurate"
+    inaccurate = "inaccurate"
+class LLMEvalResult(BaseModel):  # evaluation result: a verdict label plus a short explanation
+    accuracy: AccuracyEnum = Field(description="Label indicating if the answer is accurate or inaccurate.")
+    feedback: str = Field(description="Explanation of why the specific label was assigned. Must be concise and not more than 2 sentences.")

lib/loader.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import os
+from langchain_community.document_loaders.generic import GenericLoader
+from langchain_community.document_loaders.parsers import LanguageParser
+from langchain_text_splitters import Language
+def load_files(repository_path):
+    loader = GenericLoader.from_filesystem(
+        repository_path,
+        glob="**/*",
+        suffixes=[".py"],
+        parser=LanguageParser(
+            language=Language.PYTHON
+        )
+    )
+    docs = loader.load()
+    return docs

lib/models.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import os
+from dotenv import load_dotenv
+from langchain_openai import OpenAI
+from langchain_groq import ChatGroq
+from langchain_openai import OpenAIEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings
+load_dotenv()  # runs before MODELS_MAP is built: the os.getenv calls below execute at import time
+MODELS_MAP = {  # display name -> LLM class/kwargs and the embedding class/kwargs used with it
+    "OpenAI gpt-4o": {  # NOTE(review): "params" passes no model name although the label says gpt-4o - confirm which model OpenAI() selects
+        "class": OpenAI,
+        "params": {
+            "temperature": 0,
+            "api_key": os.getenv("OPENAI_API_KEY")
+        },
+        "embedding_class": OpenAIEmbeddings,
+        "embedding_params": {
+            "api_key": os.getenv("OPENAI_API_KEY")
+        }
+    },
+    "Groq LLaMA3 70b": {
+        "class": ChatGroq,
+        "params": {
+            "model_name": "llama3-70b-8192",
+            "groq_api_key": os.getenv("GROQ_API_KEY")
+        },
+        "embedding_class": HuggingFaceEmbeddings,  # both Groq entries share this embedding configuration
+        "embedding_params": {
+            "model_name": "sentence-transformers/all-MiniLM-L6-v2"
+        }
+    },
+    "Groq Mixtral 8x7b": {
+        "class": ChatGroq,
+        "params": {
+            "model_name": "mixtral-8x7b-32768",
+            "groq_api_key": os.getenv("GROQ_API_KEY")
+        },
+        "embedding_class": HuggingFaceEmbeddings,
+        "embedding_params": {
+            "model_name": "sentence-transformers/all-MiniLM-L6-v2"
+        }
+    }
+}

lib/repository.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import os
+import git
+def download_github_repo(repo_url, repo_dir):
+    if os.path.exists(repo_dir):
+        print(f"Repository {repo_dir} already exists. Pulling latest changes.")
+        repo = git.Repo(repo_dir)
+        repo.remotes.origin.pull()
+    else:
+        print(f"Cloning repository from {repo_url} to {repo_dir}.")
+        git.Repo.clone_from(repo_url, repo_dir)

lib/utils.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from lib.models import MODELS_MAP
+def read_prompt(file_name):
+    with open(file_name, 'r') as file:
+        return file.read()
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+def retrieve_answer(output):
+    # print(f"Output: {output}")
+    # return output.content
+    return output
+def load_LLM(llm_name):
+    model_config = MODELS_MAP[llm_name]
+    model_class = model_config["class"]
+    params = model_config["params"]
+    llm = model_class(**params)
+    return llm
+def load_embeddings(llm_name):
+    model_config = MODELS_MAP[llm_name]
+    embedding_class = model_config["embedding_class"]
+    embedding_params = model_config["embedding_params"]
+    embeddings = embedding_class(**embedding_params)
+    return embeddings
+def get_available_models():
+    return list(MODELS_MAP.keys())
+def select_model():
+    models = get_available_models()
+    print("Available Models:")
+    for i, model in enumerate(models):
+        print(f"{i + 1}. {model}")
+    while True:
+        try:
+            choice = int(input("Select a model by number: ")) - 1
+            if 0 <= choice < len(models):
+                return models[choice]
+            else:
+                print("Invalid choice. Please select a number from the list.")
+        except ValueError:
+            print("Invalid input. Please enter a number.")

main.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import argparse
+import os
+from dotenv import load_dotenv
+from langchain.globals import set_debug
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from lib.repository import download_github_repo
+from lib.loader import load_files
+from lib.chain import create_retriever, create_qa_chain
+from lib.utils import read_prompt, load_LLM, select_model
+from lib.models import MODELS_MAP
+# set_debug(True)
+def main():
+    # Prompt user to select the model
+    model_name = select_model()
+    model_info = MODELS_MAP[model_name]
+    # Parse the command line arguments
+    parser = argparse.ArgumentParser(description="GitHub Repo QA CLI Application")
+    parser.add_argument("repo_url", type=str, help="URL of the GitHub repository")
+    args = parser.parse_args()
+    # Extract the repository name from the URL
+    repo_url = args.repo_url
+    repo_name = repo_url.split("/")[-1].replace(".git", "")
+    # Compute the path to the data folder relative to the script's directory
+    base_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_dir = os.path.join(base_dir, "data", repo_name)
+    db_dir = os.path.join(base_dir, "data", "db")
+    prompt_templates_dir = os.path.join(base_dir, "prompt_templates")
+    # Download the GitHub repository
+    print(f"Downloading repository from {repo_url}...")
+    download_github_repo(repo_url, repo_dir)
+    # Load prompt templates
+    prompts_text = {
+        "initial_prompt": read_prompt(os.path.join(prompt_templates_dir, 'initial_prompt.txt')),
+        "evaluation_prompt": read_prompt(os.path.join(prompt_templates_dir, 'evaluation_prompt.txt')),
+    }
+    # Load documents from the repository
+    print(f"Loading documents from {repo_dir}...")
+    document_chunks = load_files(repository_path=repo_dir)
+    print(f"Created chunks length is: {len(document_chunks)}")
+    # Create model, retriever
+    print(f"Creating retrieval QA chain using {model_name}...")
+    llm = load_LLM(model_name)
+    retriever = create_retriever(model_name, db_dir, document_chunks)
+    qa_chain = create_qa_chain(llm, retriever, prompts_text)
+    print("You can start asking questions. Type 'exit' to quit.")
+    while True:
+        question = input("Question: ")
+        if question.lower() == "exit":
+            break
+        answer = qa_chain.invoke(question)
+        print(f"Answer: {answer['output']}")
+if __name__ == "__main__":
+    main()

prompt_templates/README.md ADDED Viewed

	@@ -0,0 +1,4 @@

+`python -m venv .venv`
+`source .venv/bin/activate`
+`pip3 install -r requirements.txt`
+`python3 main.py https://github.com/streamlit/streamlit`

prompt_templates/evaluation_prompt.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+You are a technical assessor reviewing a test. You are provided with a question along with an answer for the question written by a developer. Evaluate the question-answer pair and provide feedback.
+{format_instructions}
+Question: {question}
+Answer: {answer}

prompt_templates/initial_prompt.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+You are an assistant for question-answering tasks in the software engineering field. Use the following pieces of retrieved context from the provided GitHub repository to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. If applicable, include a brief code snippet to illustrate your answer.
+Question: {question}
+Repository Context: {context}
+Answer:

requirements.txt CHANGED Viewed

@@ -15,4 +15,6 @@ langchain-text-splitters
 esprima
 tree_sitter
 tree_sitter_languages
-pysqlite3

 esprima
 tree_sitter
 tree_sitter_languages
+pysqlite3-binary
+GitPython
+gradio