KonstantinosKakkavas's picture
first
adbfd9e verified
raw
history blame
2.39 kB
import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone
from langchain_openai import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.chains.summarize import load_summarize_chain
from langchain import HuggingFaceHub
from PyPDF2 import PdfReader
from langchain.schema import Document
def get_pdf_text(pdf_doc):
    """Return the concatenated text of every page in *pdf_doc*.

    Args:
        pdf_doc: A path or binary file-like object accepted by PyPDF2's
            ``PdfReader`` (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: All page texts joined together ("" for a PDF with no text layer).
    """
    pdf_reader = PdfReader(pdf_doc)
    # extract_text() may return None for pages without an extractable text
    # layer (scanned images); coalesce to "" so joining never raises TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def create_docs(user_pdf_list, unique_id):
    """Build one langchain ``Document`` per uploaded PDF file.

    Args:
        user_pdf_list: Iterable of uploaded file objects exposing
            ``name``, ``type`` and ``size`` attributes.
        unique_id: Identifier stamped into every document's metadata.

    Returns:
        list[Document]: Documents whose page_content is the full PDF text.
    """
    documents = []
    for uploaded in user_pdf_list:
        extracted_text = get_pdf_text(uploaded)
        metadata = {
            "name": uploaded.name,
            # "id" intentionally omitted: not every uploaded file carries
            # an ``id`` attribute — TODO confirm before re-adding it.
            "type": uploaded.type,
            "size": uploaded.size,
            "unique_id": unique_id,
        }
        documents.append(Document(page_content=extracted_text, metadata=metadata))
    return documents
def create_embeddings_load_data():
    """Load and return the ``all-MiniLM-L6-v2`` sentence-embedding model.

    Returns:
        SentenceTransformer: Model whose ``encode()`` maps text to vectors.
    """
    # Sentences are encoded by calling model.encode() on the returned object.
    return SentenceTransformer("all-MiniLM-L6-v2")
def push_to_pinecone(pinecone_apikey, pinecone_index_name, embeddings: SentenceTransformer, docs: list[Document]):
    """Encode each document's text into an embedding vector, in place.

    NOTE(review): despite the name and the original docstring ("push data to
    Vector database"), nothing is ever upserted — the Pinecone client and
    index are created but unused. TODO: call ``index.upsert(...)`` with
    (id, vector, metadata) tuples, e.g. keyed by ``doc.metadata['unique_id']``.

    Args:
        pinecone_apikey: Pinecone API key.
        pinecone_index_name: Name of the target Pinecone index.
        embeddings: Sentence-transformer model used to vectorize text.
        docs: Documents whose ``page_content`` is replaced by its embedding.
            WARNING: the caller's Document objects are mutated.

    Returns:
        list[Document]: The same ``docs`` list, with vectors as page_content.
    """
    pc = Pinecone(api_key=pinecone_apikey)
    index = pc.Index(pinecone_index_name)  # TODO: actually upsert into this index
    # Replace the raw text with its embedding vector (mutates caller's docs).
    for doc in docs:
        doc.page_content = embeddings.encode(doc.page_content)
    return docs
# def pull_from_pinecone(pinecone_apikey, pinecone_index_name, docs: list[Document]):
# if