# Helper utilities for a PDF-screening app: PDF text extraction,
# sentence-transformer embeddings, and pushing vectors to Pinecone.
import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone
from langchain_openai import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.chains.summarize import load_summarize_chain
from langchain import HuggingFaceHub
from PyPDF2 import PdfReader
from langchain.schema import Document
def get_pdf_text(pdf_doc):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_doc: A file path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        str: Text of all pages concatenated in order. Empty string for
        a PDF with no extractable text.
    """
    pdf_reader = PdfReader(pdf_doc)
    # extract_text() returns None for pages with no extractable text
    # (e.g. scanned/image-only pages); coerce to "" so the join never
    # raises TypeError. join avoids quadratic += string building.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def create_docs(user_pdf_list, unique_id):
    """Build one langchain ``Document`` per uploaded PDF.

    Args:
        user_pdf_list: Iterable of uploaded file objects exposing
            ``name``, ``type`` and ``size`` attributes (e.g. Streamlit
            ``UploadedFile``).
        unique_id: Identifier stamped into every document's metadata so
            this upload batch can be found later.

    Returns:
        list[Document]: One document per input file, holding the
        extracted text and file metadata.
    """
    documents = []
    for uploaded_file in user_pdf_list:
        extracted_text = get_pdf_text(uploaded_file)
        metadata = {
            "name": uploaded_file.name,
            # "id" is deliberately omitted: not every uploaded file
            # exposes an id attribute (see original TODO).
            "type": uploaded_file.type,
            "size": uploaded_file.size,
            "unique_id": unique_id,
        }
        documents.append(Document(page_content=extracted_text, metadata=metadata))
    return documents
def create_embeddings_load_data(model_name="all-MiniLM-L6-v2"):
    """Load a SentenceTransformer embedding model.

    Args:
        model_name: Hugging Face model id to load. Defaults to the
            lightweight ``all-MiniLM-L6-v2`` model used elsewhere in
            this module, so existing callers are unaffected.

    Returns:
        SentenceTransformer: A model whose ``encode()`` method maps
        text to embedding vectors.
    """
    # Sentences are encoded by calling model.encode() on the result.
    return SentenceTransformer(model_name)
def push_to_pinecone(pinecone_apikey, pinecone_index_name, embeddings: SentenceTransformer, docs: list[Document]):
    """Embed each document and upsert the vectors into a Pinecone index.

    Bug fix: the previous version created the index handle but never
    wrote to it (the upsert was commented out), so nothing was actually
    pushed to Pinecone despite the function's name.

    Args:
        pinecone_apikey: Pinecone API key.
        pinecone_index_name: Name of an existing Pinecone index.
        embeddings: SentenceTransformer model used to encode each
            document's text.
        docs: Documents to embed; each is mutated in place (see below).

    Returns:
        list[Document]: The same ``docs`` list, with each
        ``page_content`` replaced by its embedding vector — this
        in-place mutation matches the original contract, so existing
        callers keep working.
    """
    pc = Pinecone(api_key=pinecone_apikey)
    index = pc.Index(pinecone_index_name)

    vectors = []
    for position, doc in enumerate(docs):
        vector = embeddings.encode(doc.page_content)
        # Preserve the original (surprising) behavior of handing the
        # encoded vectors back through page_content.
        doc.page_content = vector
        # The original commented-out design keyed every vector on the
        # shared unique_id, which would make later upserts overwrite
        # earlier ones; suffix the position to keep ids distinct.
        vector_id = f"{doc.metadata['unique_id']}-{position}"
        vectors.append((vector_id, vector.tolist(), doc.metadata))

    # Actually push the data — this call was missing before.
    index.upsert(vectors=vectors)
    return docs
# TODO: implement pull_from_pinecone(pinecone_apikey, pinecone_index_name, ...)
# to query matching vectors back out of the index (counterpart to
# push_to_pinecone above).