import os
from datetime import datetime

import gradio as gr
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Embeddings via OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Initialize the Pinecone client
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Target index and the LangChain vector store wrapper around it
index_name = "italy-kg"
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Global list that records every uploaded document
uploaded_documents = []


# Process a PDF: extract its text, split the text into chunks, and upload the
# chunks to the Pinecone index. Returns the updated upload log (as rows for
# the Gradio dataframe) and a status message.
def process_pdf(pdf_file, uploaded_documents):
    if pdf_file is None:
        return uploaded_documents, "No PDF file uploaded."

    # Gradio may pass a file path (str) or a tempfile-like object with .name
    file_path = getattr(pdf_file, "name", pdf_file)

    with pdfplumber.open(file_path) as pdf:
        all_text = ""
        for page in pdf.pages:
            # extract_text() returns None for pages with no extractable text
            all_text += page.extract_text() or ""

    # Split the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    chunks = text_splitter.split_text(all_text)

    # Embed and upload all chunks to the vector database in one batch
    documents = [Document(page_content=chunk) for chunk in chunks]
    vectorstore.add_documents(documents)

    # Record the upload in the history log
    document_record = {
        "Document Name": os.path.basename(file_path),
        "File Size (KB)": round(os.path.getsize(file_path) / 1024, 1),
        "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Chunks": len(chunks),
        "Pinecone Index": index_name,
    }
    uploaded_documents.append(document_record)

    # Convert the list of dicts into a list of lists for the Gradio dataframe
    table_data = [
        [
            doc["Document Name"],
            doc["File Size (KB)"],
            doc["Upload Time"],
            doc["Chunks"],
            doc["Pinecone Index"],
        ]
        for doc in uploaded_documents
    ]
    return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
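
# The script assumes the "italy-kg" index already exists in Pinecone. If it
# may not, a minimal sketch of creating it on first run is shown below; it
# belongs before the PineconeVectorStore construction above. The dimension
# of 1536 (OpenAI's text-embedding-ada-002 default) and the aws/us-east-1
# serverless spec are assumptions and should match your Pinecone project.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # assumed: embedding size of the OpenAI model in use
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # assumed values
    )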
# Gradio Blocks app with a PDF uploader and a table of upload logs
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")

    with gr.Column():
        # File upload component, restricted to PDFs
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])

        # Button to trigger processing
        process_button = gr.Button("Process PDF and Upload")

        # Dataframe that displays one row per uploaded document
        document_table = gr.Dataframe(
            headers=["Document Name", "File Size (KB)", "Upload Time", "Chunks", "Pinecone Index"],
            interactive=False,
        )

        # Textbox for the status message
        output_textbox = gr.Textbox(label="Result")

    # Wire the button to process_pdf; the gr.State([]) list holds the upload
    # history. Because the list is mutated in place and never returned, the
    # same object persists across clicks within a session.
    process_button.click(
        fn=process_pdf,
        inputs=[file_input, gr.State([])],
        outputs=[document_table, output_textbox],
    )

demo.queue()
demo.launch(show_error=True)
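
# Once chunks are uploaded, they can be retrieved from the same vector store
# with a similarity search. A minimal sketch (the query string is purely
# illustrative), to be run in a separate session or before demo.launch():
#
#     results = vectorstore.similarity_search("What are the best cities in Italy?", k=3)
#     for doc in results:
#         print(doc.page_content)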