Spaces:
Sleeping
Sleeping
File size: 1,290 Bytes
0c442d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from PyPDF2 import PdfReader
def get_pdf_text(pdf_docs) -> str:
    """Return the text of every page of every PDF, concatenated.

    Args:
        pdf_docs: Iterable of objects that ``PdfReader`` accepts; each one
            is opened with ``PdfReader(pdf)``.

    Returns:
        The page texts joined in document order, then page order, with no
        separator between them. An empty ``pdf_docs`` yields ``""``.
    """
    # A single join avoids the repeated string reallocation that a
    # ``text += ...`` loop incurs as the accumulated text grows.
    return "".join(
        page.extract_text()
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
def loadUrlData(url, *, verify_ssl=False):
    """Load the content behind ``url`` through ``WebBaseLoader``.

    Args:
        url: Value handed unchanged to ``WebBaseLoader``.
        verify_ssl: Passed as the ``verify`` entry of the loader's
            ``requests_kwargs``. Defaults to ``False`` so existing callers
            keep the behaviour this helper has always had.

    Returns:
        Whatever ``loader.load()`` returns for ``url``.
    """
    loader = WebBaseLoader(url)
    # SECURITY: verify=False disables TLS certificate validation, which
    # exposes the fetch to man-in-the-middle tampering. Callers that do not
    # need to reach hosts with broken or self-signed certificates should
    # pass verify_ssl=True.
    loader.requests_kwargs = {'verify': verify_ssl}
    return loader.load()
def splitDoc(data):
    """Split documents into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size
    of 1000, an overlap of 200 and ``add_start_index=True``.

    Args:
        data: Documents handed to ``split_documents``.

    Returns:
        The result of ``split_documents(data)``.
    """
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "add_start_index": True,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(data)
    return chunks
def splitText(data):
    """Split a raw string into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size
    of 400 and an overlap of 50, measuring length with ``len`` and treating
    separators as literal strings (``is_separator_regex=False``).

    Args:
        data: Text handed to ``split_text``.

    Returns:
        The result of ``split_text(data)``.
    """
    chunker = RecursiveCharacterTextSplitter(
        is_separator_regex=False,
        length_function=len,
        chunk_overlap=50,
        chunk_size=400,
    )
    pieces = chunker.split_text(data)
    return pieces
def vectorize(data, type):
    """Chunk ``data`` and index the chunks in a Chroma vector store.

    Args:
        data: Input to index. It is passed to ``splitDoc`` when ``type`` is
            ``"document"`` and to ``splitText`` when ``type`` is ``"text"``.
        type: Either ``"document"`` or ``"text"``; selects the splitter and
            the matching ``Chroma`` constructor. The name shadows the
            builtin but is kept so keyword callers keep working.

    Returns:
        The store built by ``Chroma.from_documents`` or
        ``Chroma.from_texts``, using ``OpenAIEmbeddings()`` as the embedding.

    Raises:
        ValueError: If ``type`` is neither ``"document"`` nor ``"text"``.
    """
    if type == "document":
        return Chroma.from_documents(
            documents=splitDoc(data), embedding=OpenAIEmbeddings()
        )
    if type == "text":
        return Chroma.from_texts(
            texts=splitText(data), embedding=OpenAIEmbeddings()
        )
    # Fail loudly here: silently returning None for an unknown kind would
    # only surface later as an AttributeError at the caller.
    raise ValueError(
        f"Unsupported type {type!r}; expected 'document' or 'text'"
    )
|