# "Spaces / Sleeping" — Hugging Face Spaces page-status header captured by the
# scraper; not part of the program.
import gradio as gr
import PyPDF2
import cohere
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import io
import os

# Initialize Pinecone and connect to the index.
# SECURITY NOTE(review): these API keys were committed in plain text. They are
# kept only as fallbacks for backward compatibility — rotate both keys and
# supply them via environment variables before deploying.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "0f78bc1b-81f7-4a15-9af3-0fbcf0acdb4e"))
index = pc.Index("quickstart")

# Sentence-transformer model used for both document and query embeddings.
# The Pinecone index dimension must match this model's output dimension.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Cohere client used for answer generation.
co = cohere.Client(os.environ.get("COHERE_API_KEY", "CxIrucBVA8NNJJOBUnxwRWq488MVydBku1DlqP1u"))
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF, with error handling.

    Args:
        pdf_file: Either a filesystem path string — what
            ``gr.File(type="filepath")`` passes to callbacks — or a
            file-like object exposing a ``.name`` attribute. ``None`` is
            tolerated.

    Returns:
        str: The concatenated text of all pages, or a human-readable
        error message when the file is missing, empty, encrypted, or
        unreadable.
    """
    try:
        if pdf_file is None:
            return "No file uploaded."
        # gr.File(type="filepath") hands the callback a plain path string;
        # older type="file" uploads pass an object carrying .name. The
        # original `pdf_file.name` raised AttributeError on the string case.
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        with open(path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # extract_text() can return None (e.g. image-only pages).
            text = "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )
        if not text.strip():
            return "The uploaded PDF is empty or has no readable content."
        return text
    except PyPDF2.errors.PdfReadError:
        return "The uploaded PDF is encrypted or unreadable."
    except Exception as e:
        # Best-effort UI: surface the problem as a status string rather
        # than letting the Gradio callback crash.
        return f"Error reading PDF: {str(e)}"
def store_pdf_embeddings(pdf_text):
    """Embed the PDF text in fixed-size chunks and upsert them into Pinecone.

    Args:
        pdf_text: Full extracted text of the PDF.

    Returns:
        str: A status message suitable for display in the UI.
    """
    if not pdf_text or not pdf_text.strip():
        # Nothing to index (extraction failed or the PDF was empty).
        return "No text to store."
    # Naive fixed-width chunking: 512 characters per segment.
    segments = [pdf_text[i:i + 512] for i in range(0, len(pdf_text), 512)]
    embeddings = model.encode(segments)
    # Store the raw segment text as metadata so queries can recover the
    # actual passage — the vector id alone is not human-readable.
    vectors = [
        (f"seg-{i}", embed.tolist(), {"text": seg})
        for i, (embed, seg) in enumerate(zip(embeddings, segments))
    ]
    index.upsert(vectors=vectors)
    return "PDF uploaded and stored successfully!"
def ask_question(query):
    """Answer a user question using the most relevant stored PDF segment.

    Args:
        query: The user's natural-language question.

    Returns:
        tuple[str, str]: (retrieved segment shown to the user, generated
        answer).
    """
    if not query or not query.strip():
        return "No question provided.", ""
    query_embedding = model.encode(query).tolist()
    # Retrieve the most relevant segment from Pinecone. include_metadata
    # lets us recover the stored passage text instead of only its id.
    result = index.query(top_k=1, vector=query_embedding, include_metadata=True)
    matches = result['matches']
    if not matches:
        return "No relevant segment found.", "Please upload a PDF first."
    match = matches[0]
    # Prefer the stored passage text; fall back to the id label for
    # vectors upserted without metadata (backward compatible — this was
    # the original behavior, and made the LLM context nearly useless).
    metadata = match.get('metadata') or {}
    segment_text = metadata.get('text') or f"Segment: {match['id']}"
    # Generate the answer using the retrieved segment as context.
    prompt = f"{segment_text}\nQuestion: {query}\nAnswer:"
    response = co.generate(
        model="command-xlarge-nightly",
        prompt=prompt,
        max_tokens=50
    )
    answer = response.generations[0].text.strip()
    return segment_text, answer
# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Interactive QA Bot with PDF Support")

    # PDF upload section.
    pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
    upload_status = gr.Textbox(label="Upload Status", interactive=False)
    upload_button = gr.Button("Upload and Store")

    def _handle_upload(pdf):
        # Guard against clicking the button with no file selected.
        if pdf is None:
            return "Please upload a valid PDF."
        return store_pdf_embeddings(extract_text_from_pdf(pdf))

    upload_button.click(_handle_upload, inputs=pdf_input, outputs=upload_status)

    # Question-and-answer section.
    query_input = gr.Textbox(label="Enter your question")
    segment_output = gr.Textbox(label="Retrieved Segment", interactive=False)
    answer_output = gr.Textbox(label="Answer", interactive=False)
    query_button = gr.Button("Ask")

    query_button.click(
        ask_question, inputs=query_input, outputs=[segment_output, answer_output]
    )

demo.launch(share=True)  # share=True exposes a temporary public URL