Spaces:
Sleeping
Sleeping
from langchain_community.document_loaders import WebBaseLoader | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
from langchain_chroma import Chroma | |
from langchain_openai import OpenAIEmbeddings | |
from PyPDF2 import PdfReader | |
def get_pdf_text(pdf_docs):
    """Return the extracted text of every page of every PDF, concatenated.

    Args:
        pdf_docs: Iterable of PDF sources. Each item is handed unchanged to
            ``PdfReader``, so it must be whatever ``PdfReader`` accepts.

    Returns:
        One string holding the text of all pages, in document order and then
        page order, with no separator inserted between pages. An empty
        iterable yields ``""``.
    """
    # Join once instead of growing a string with ``+=`` for every page;
    # repeated concatenation can degrade to quadratic copying on large PDFs.
    return "".join(
        page.extract_text()
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
def loadUrlData(url, *, verify_ssl=False):
    """Load a web page through ``WebBaseLoader`` and return the loader output.

    Args:
        url: Address handed to ``WebBaseLoader``.
        verify_ssl: Value stored under the ``'verify'`` key of the loader's
            ``requests_kwargs``. Defaults to ``False`` so existing callers
            keep the behaviour they had before this parameter existed.

    Returns:
        Whatever ``WebBaseLoader.load()`` returns for ``url``.
    """
    loader = WebBaseLoader(url)
    # SECURITY: with the default ``verify_ssl=False`` the HTTP request skips
    # TLS certificate verification, which exposes the fetch to
    # man-in-the-middle tampering. Pass ``verify_ssl=True`` whenever the
    # target host presents a valid certificate.
    loader.requests_kwargs = {'verify': verify_ssl}
    return loader.load()
def splitDoc(data):
    """Split already-loaded documents into overlapping chunks.

    Args:
        data: Documents to split; forwarded as-is to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The result of ``split_documents`` for ``data``.
    """
    # Splitter configuration: chunks of size 1000 with an overlap of 200,
    # and ``add_start_index`` switched on.
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "add_start_index": True,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_documents(data)
    return chunks
def splitText(data):
    """Split a raw string into overlapping text chunks.

    Args:
        data: Text to split; forwarded as-is to
            ``RecursiveCharacterTextSplitter.split_text``.

    Returns:
        The result of ``split_text`` for ``data``.
    """
    # Chunk length is measured with ``len``; separators are treated as
    # literal strings rather than regular expressions.
    splitter_options = {
        "chunk_size": 400,
        "chunk_overlap": 50,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_text(data)
    return pieces
def vectorize(data, type):
    """Chunk ``data`` and index the chunks in a Chroma vector store.

    Args:
        data: The content to index. For ``type == "document"`` it is passed
            to ``splitDoc``; for ``type == "text"`` it is passed to
            ``splitText``.
        type: Either ``"document"`` or ``"text"``; selects how ``data`` is
            split and which Chroma constructor is used. (The name shadows
            the builtin ``type`` but is kept because callers may pass it by
            keyword.)

    Returns:
        The store built by ``Chroma.from_documents`` or
        ``Chroma.from_texts``, using ``OpenAIEmbeddings()`` as the embedding.

    Raises:
        ValueError: If ``type`` is neither ``"document"`` nor ``"text"``.
            Previously this case silently returned ``None``, which only
            surfaced later as an attribute error on the missing store.
    """
    if type == "document":
        docs = splitDoc(data)
        return Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())
    if type == "text":
        texts = splitText(data)
        return Chroma.from_texts(texts=texts, embedding=OpenAIEmbeddings())
    raise ValueError(
        f"Unknown data type {type!r}; expected 'document' or 'text'"
    )