# Chat-W-Git/src/main.py
# Author: Rohan Kataria
import datetime
import os
import shutil

from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import GitLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch

# Load a repository from GitHub with LangChain's GitLoader, given the repo URL,
# the branch to check out, and a comma-separated string of file extensions to keep.
def loader(url: str, branch: str, file_filter: str):
    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)  # Remove any stale clone from a previous run

    git_loader = GitLoader(
        clone_url=url,
        repo_path=repo_path,
        branch=branch,
        # The whole repo is cloned, but only files whose extension matches the
        # filter are loaded as documents.
        file_filter=lambda file_path: file_path.endswith(tuple(file_filter.split(","))),
    )
    data = git_loader.load()
    return data
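
# Example call (a minimal sketch; the URL, branch, and extension list below are
# hypothetical placeholders, not values shipped with this project):
#   data = loader("https://github.com/<owner>/<repo>", "main", ".py,.md")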

# Split the loaded documents into overlapping chunks with a recursive character text splitter.
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,      # Maximum characters per chunk
        chunk_overlap=150,    # Characters shared between consecutive chunks
        length_function=len,  # Function to measure the length of chunks while splitting
        add_start_index=True  # Include the starting position of each chunk in metadata
    )
    chunks = splitter.split_documents(data)
    return chunks
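
# Rough intuition for the settings above (a sketch, not part of the pipeline):
# under the default separators, text with no natural break points is cut so
# that chunks start every chunk_size - chunk_overlap = 850 characters, e.g.:
#   chunks = split_data([Document(page_content="x" * 2300)])
#   [c.metadata["start_index"] for c in chunks]  # -> [0, 850, 1700]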

# Embed the chunks and ingest them into an in-memory DocArray vector store.
def ingest_chunks(chunks):
    embedding = OpenAIEmbeddings()
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)

    # The clone is no longer needed once the chunks are embedded, so clean it up.
    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    return vector_store
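
# Quick sanity check on the store (a sketch; the query text is arbitrary):
#   docs = vector_store.similarity_search("where is the repo cloned?", k=2)
#   print(docs[0].page_content)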

# Retrieval function: build a conversational retrieval chain over the vector
# store that answers user questions from the ingested documents.
def retrieval(vector_store):
    # Selecting the right model (accounts for the planned deprecation of the
    # gpt-3.5-turbo-0301 snapshot)
    current_date = datetime.datetime.now().date()
    if current_date < datetime.date(2023, 9, 2):
        llm_name = "gpt-3.5-turbo-0301"
    else:
        llm_name = "gpt-3.5-turbo"

    # Creating the LLM
    llm = ChatOpenAI(model=llm_name, temperature=0)

    # Creating the prompt template
    template = """
You're a code summarisation assistant. Given the following extracted parts of a long document and a question, create a final answer with "CODE SNIPPETS" from "SOURCE DOCUMENTS".
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "CODE SNIPPETS" from "SOURCE DOCUMENTS" part in your answer.

QUESTION: {question}
=========
CONTEXT: {context}
=========
FINAL ANSWER:"""
    PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

    # Creating the memory; input_key and output_key must be set explicitly
    # because the chain returns more than one output key (see below).
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True)

    # Creating the retriever; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})  # search_type can be "similarity" or "mmr"

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",  # chain type can be "stuff", "refine", or "map_reduce"
        retriever=retriever,
        memory=memory,
        return_source_documents=True,  # Adds "source_documents" to the output alongside "answer"
        combine_docs_chain_kwargs={"prompt": PROMPT},
    )
    return chain
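
# Querying the chain directly (a sketch; assumes OPENAI_API_KEY is set):
#   result = chain({"question": "What does the loader function do?"})
#   result["answer"]            # the model's reply
#   result["source_documents"]  # the retrieved chunks used as context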

# Class wiring all of the above components together into a QA system.
class ConversationalResponse:
    def __init__(self, url, branch, file_filter):
        self.url = url
        self.branch = branch
        self.file_filter = file_filter
        self.data = loader(self.url, self.branch, self.file_filter)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_chunks(self.chunks)
        self.chain_type = "stuff"  # Recorded for reference; retrieval() currently hardcodes these values
        self.k = 5
        self.chain = retrieval(self.vector_store)

    def __call__(self, question):
        response = self.chain(question)
        return response['answer']
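
# Example usage (a minimal sketch: the URL, branch, filter, and question are
# hypothetical placeholders, and OPENAI_API_KEY must be set in the environment).
if __name__ == "__main__":
    qa = ConversationalResponse(
        url="https://github.com/<owner>/<repo>",
        branch="main",
        file_filter=".py,.md",
    )
    print(qa("What does this repository do?"))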