import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone
from langchain_openai import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.chains.summarize import load_summarize_chain
from langchain import HuggingFaceHub
from PyPDF2 import PdfReader
from langchain.schema import Document

# Number of vectors sent to Pinecone per upsert request; keeps each request
# small when many documents are pushed at once.
_UPSERT_BATCH_SIZE = 100


def get_pdf_text(pdf_doc):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_doc: The PDF source handed straight to ``PdfReader``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        between pages.
    """
    pdf_reader = PdfReader(pdf_doc)
    # ``or ""`` keeps a page that yields no text (None / empty) from raising
    # a TypeError during concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF file.

    Args:
        user_pdf_list: Iterable of uploaded file objects. Each one must expose
            ``name``, ``type`` and ``size`` attributes and be readable by
            ``get_pdf_text``.
        unique_id: Identifier stored in every document's metadata under
            ``"unique_id"`` (the same value for all files of this call).

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the full text
        of the corresponding PDF.
    """
    docs = []
    for pdf_file in user_pdf_list:
        docs.append(
            Document(
                page_content=get_pdf_text(pdf_file),
                metadata={
                    "name": pdf_file.name,
                    # "id" is deliberately not stored: not every uploaded
                    # file appears to carry an ``id`` attribute.
                    "type": pdf_file.type,
                    "size": pdf_file.size,
                    "unique_id": unique_id,
                },
            )
        )
    return docs


def create_embeddings_load_data() -> SentenceTransformer:
    """Load and return the ``all-MiniLM-L6-v2`` sentence-transformer model.

    Sentences are turned into vectors by calling ``encode()`` on the result.
    """
    return SentenceTransformer("all-MiniLM-L6-v2")


def push_to_pinecone(pinecone_apikey, pinecone_index_name,
                     embeddings: SentenceTransformer, docs: list[Document]):
    """Embed the documents and upsert the vectors into a Pinecone index.

    Args:
        pinecone_apikey: API key used to create the Pinecone client.
        pinecone_index_name: Name of the target index.
        embeddings: Model whose ``encode()`` turns text into vectors.
        docs: Documents to push; ``page_content`` must hold the text.

    Returns:
        The same ``docs`` list that was passed in. The documents are NOT
        modified: their ``page_content`` still holds the original text.
    """
    if not docs:
        # Nothing to embed or upload; avoid creating a client for no work.
        return docs

    pc = Pinecone(api_key=pinecone_apikey)
    index = pc.Index(pinecone_index_name)

    # Encode all texts in one call; one vector comes back per document.
    vectors = embeddings.encode([doc.page_content for doc in docs])

    records = []
    for position, (doc, vector) in enumerate(zip(docs, vectors)):
        records.append(
            {
                # Documents created together share one unique_id, so the
                # position is appended to keep vector ids distinct.
                "id": f"{doc.metadata.get('unique_id', 'doc')}-{position}",
                # Pinecone expects plain Python floats, not a numpy array.
                "values": vector.tolist(),
                # Null metadata values are rejected by Pinecone; drop them.
                "metadata": {
                    key: value
                    for key, value in doc.metadata.items()
                    if value is not None
                },
            }
        )

    for start in range(0, len(records), _UPSERT_BATCH_SIZE):
        index.upsert(vectors=records[start:start + _UPSERT_BATCH_SIZE])

    return docs


# Unfinished draft kept from the original file:
# def pull_from_pinecone(pinecone_apikey, pinecone_index_name, docs: list[Document]): # if