Spaces:

PabloVD
/

CAMELSDocBot

Sleeping

App Files Files Community

CAMELSDocBot / app.py

PabloVD

Replace pdf loading by urls loading

2ccbf76 4 months ago

raw

history blame

4.85 kB

	# Reference: LangChain RAG tutorial, https://python.langchain.com/docs/tutorials/rag/
	import gradio as gr
	from langchain import hub
	from langchain_chroma import Chroma
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.runnables import RunnablePassthrough
	from langchain_mistralai import MistralAIEmbeddings
	from langchain_community.embeddings import HuggingFaceInstructEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_mistralai import ChatMistralAI
	from langchain_community.document_loaders import PyPDFLoader
	import requests
	from pathlib import Path
	from langchain_community.document_loaders import WebBaseLoader
	import bs4
	from langchain_core.rate_limiters import InMemoryRateLimiter
	from urllib.parse import urljoin

	# Client-side request throttle; it is passed to the ChatMistralAI model created further down.
	# NOTE(review): the earlier comments said "once every second" and "every 100 ms",
	# which did not match the values below. Confirm the rate intended for the MistralAI free tier.
	rate_limiter = InMemoryRateLimiter(
	requests_per_second=0.1, # 0.1 requests per second, i.e. at most one request every 10 seconds
	check_every_n_seconds=0.01, # Wake up every 10 ms to check whether allowed to make a request,
	max_bucket_size=10, # Controls the maximum burst size.
	)

	# # Get data from url
	# url = 'https://camels.readthedocs.io/_/downloads/en/latest/pdf/'
	# r = requests.get(url, stream=True)
	# document_path = Path('data.pdf')

	# document_path.write_bytes(r.content)
	# # document_path = "camels-readthedocs-io-en-latest.pdf"
	# loader = PyPDFLoader(document_path)
	# docs = loader.load()

	# # Load, chunk and index the contents of the blog.
	# url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
	# loader = WebBaseLoader(
	# web_paths=(url,),
	# bs_kwargs=dict(
	# parse_only=bs4.SoupStrainer(
	# class_=("post-content", "post-title", "post-header")
	# )
	# ),
	# )
	# loader = WebBaseLoader(url)
	# docs = loader.load()

	# def get_subpages(base_url):
	# visited_urls = []
	# urls_to_visit = [base_url]

	# while urls_to_visit:
	# url = urls_to_visit.pop(0)
	# if url in visited_urls:
	# continue

	# visited_urls.append(url)
	# response = requests.get(url)
	# soup = bs4.BeautifulSoup(response.content, "html.parser")

	# for link in soup.find_all("a", href=True):
	# full_url = urljoin(base_url, link['href'])
	# if base_url in full_url and not full_url.endswith(".html") and full_url not in visited_urls:
	# urls_to_visit.append(full_url)
	# visited_urls = visited_urls[1:]

	# return visited_urls

	# base_url = "https://camels.readthedocs.io/en/latest/"
	# # base_url = "https://carla.readthedocs.io/en/latest/"
	# # urls = get_subpages(base_url)

	# Read the documentation page URLs from urls.txt, one URL per line.
	# The context manager closes the file even if reading fails. Surrounding
	# whitespace (including a trailing "\r" from Windows line endings) is stripped
	# and blank lines are skipped so that no empty URL reaches the loader.
	with open("urls.txt", encoding="utf-8") as urlsfile:
	    stripped_lines = (line.strip() for line in urlsfile)
	    urls = [url for url in stripped_lines if url]

	# Fetch every listed page and load its contents as documents.
	loader = WebBaseLoader(urls)
	docs = loader.load()

	def format_docs(docs):
	    """Join the text of the given documents into a single string.

	    Args:
	        docs: Iterable of objects exposing a ``page_content`` string.

	    Returns:
	        The ``page_content`` values in order, separated by one blank line.
	        An empty iterable yields an empty string.
	    """
	    return "\n\n".join(doc.page_content for doc in docs)

	def RAG(llm, docs, embeddings):
	    """Build a retrieval-augmented generation chain over ``docs``.

	    Args:
	        llm: Chat model placed after the prompt in the chain.
	        docs: Documents to split into chunks and index.
	        embeddings: Embedding model used to build the Chroma vector store.

	    Returns:
	        A runnable chain that takes a question, retrieves matching chunks
	        from the vector store, fills the "rlm/rag-prompt" prompt with them,
	        calls ``llm`` and parses the result with ``StrOutputParser``.
	    """
	    # Split the documents into overlapping chunks.
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	    splits = text_splitter.split_documents(docs)

	    # Embed the chunks and index them in a vector store.
	    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

	    # Retriever that looks up the relevant snippets of the documents.
	    retriever = vectorstore.as_retriever()

	    # Prompt basis example for RAG systems, pulled from the LangChain hub.
	    prompt = hub.pull("rlm/rag-prompt")

	    # The question is passed through unchanged, while the context is the
	    # retrieved chunks joined into one string by format_docs.
	    rag_chain = (
	        {"context": retriever | format_docs, "question": RunnablePassthrough()}
	        | prompt
	        | llm
	        | StrOutputParser()
	    )

	    return rag_chain

	# LLM model
	# Mistral chat model; its requests go through the rate limiter defined earlier in this file.
	llm = ChatMistralAI(model="mistral-large-latest", rate_limiter=rate_limiter)

	# Embeddings
	# NOTE(review): this is a sentence-transformers checkpoint loaded through
	# HuggingFaceInstructEmbeddings. Confirm that this pairing is intended.
	embed_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
	# embed_model = "nvidia/NV-Embed-v2"
	embeddings = HuggingFaceInstructEmbeddings(model_name=embed_model)
	# embeddings = MistralAIEmbeddings()

	# RAG chain
	# Built once at import time from the loaded documents; handle_prompt streams from it.
	rag_chain = RAG(llm, docs, embeddings)

	def handle_prompt(message, history):
	    """Stream the RAG chain's answer to ``message``.

	    Args:
	        message: The user's question, passed to ``rag_chain.stream``.
	        history: Earlier conversation turns; accepted but not used.

	    Yields:
	        The answer accumulated so far, once per streamed chunk.

	    Raises:
	        gr.Error: If streaming the answer fails. The original exception is
	            attached as the cause.
	    """
	    try:
	        # Stream output, yielding the growing answer after each chunk.
	        out = ""
	        for chunk in rag_chain.stream(message):
	            out += chunk
	            yield out
	    except Exception as err:
	        # Catch ``Exception`` rather than using a bare ``except`` so that
	        # GeneratorExit (raised at ``yield`` when the consumer closes the
	        # stream), KeyboardInterrupt and SystemExit are not reported as a
	        # rate-limit failure. Chaining keeps the real cause in the traceback.
	        raise gr.Error("Requests rate limit exceeded") from err

	# Description text shown in the chat interface.
	greetingsmessage = "Hi, I'm the CAMELS DocBot, I'm here to assist you with any question related to the CAMELS simulations documentation"

	# Sample questions offered to the user as examples.
	example_questions = [
	    "How can i read a halo file?",
	    "Which simulation suites are included in CAMELS?",
	    "Which are the largest volumes in CAMELS simulations?",
	    "How can I get the power spectrum of a simulation?",
	]

	# Chat UI: every user message is answered by handle_prompt.
	demo = gr.ChatInterface(
	    handle_prompt,
	    type="messages",
	    title="CAMELS DocBot",
	    examples=example_questions,
	    theme=gr.themes.Soft(),
	    description=greetingsmessage,
	)

	# Start the Gradio app.
	demo.launch()