# File size: 1,290 Bytes
# 0c442d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from PyPDF2 import PdfReader

def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every PDF in *pdf_docs*.

    Args:
        pdf_docs: Iterable whose items are each passed unchanged to
            ``PdfReader``.

    Returns:
        One string holding the page texts in iteration order, with no
        separator inserted between pages or documents. Empty when
        *pdf_docs* yields nothing.
    """
    # Join once instead of growing a string with ``+=`` page by page.
    # ``or ""`` keeps a page without extractable text from raising a
    # TypeError should ``extract_text()`` hand back ``None``.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )

def loadUrlData(url, verify=False):
    """Load the content at *url* through ``WebBaseLoader``.

    Args:
        url: Passed unchanged to ``WebBaseLoader``.
        verify: Value placed under the ``'verify'`` key of the loader's
            ``requests_kwargs``. Defaults to ``False`` so existing callers
            keep the behaviour they had before this parameter existed.

    Returns:
        Whatever ``loader.load()`` returns.
    """
    loader = WebBaseLoader(url)
    # SECURITY: with the default ``verify=False`` the HTTP client skips TLS
    # certificate verification, which exposes the fetch to man-in-the-middle
    # tampering. Pass ``verify=True`` (or a CA bundle path) unless the target
    # is a trusted host with a self-signed certificate.
    loader.requests_kwargs = {'verify': verify}
    return loader.load()

def splitDoc(data):
    """Split *data* into chunks with ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=1000``,
    ``chunk_overlap=200`` and ``add_start_index=True``.

    Args:
        data: Passed unchanged to the splitter's ``split_documents``.

    Returns:
        The result of ``split_documents(data)``.
    """
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "add_start_index": True,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(data)
    return chunks

def splitText(data):
    """Split the string *data* into chunks with ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=400``, ``chunk_overlap=50``,
    ``len`` as the length function and ``is_separator_regex=False``.

    Args:
        data: Passed unchanged to the splitter's ``split_text``.

    Returns:
        The result of ``split_text(data)``.
    """
    splitter_options = {
        "chunk_size": 400,
        "chunk_overlap": 50,
        "length_function": len,
        "is_separator_regex": False,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = chunker.split_text(data)
    return pieces

def vectorize(data, type):
    """Split *data* and index the chunks in a Chroma store using OpenAI embeddings.

    Args:
        data: Input to split. Handed to ``splitDoc`` when *type* is
            ``"document"`` and to ``splitText`` when *type* is ``"text"``.
        type: Either ``"document"`` or ``"text"``; selects the splitter and
            the matching ``Chroma`` constructor.

    Returns:
        The object returned by ``Chroma.from_documents`` (for
        ``"document"``) or ``Chroma.from_texts`` (for ``"text"``).

    Raises:
        ValueError: If *type* is neither ``"document"`` nor ``"text"``.
    """
    # NOTE: the parameter name shadows the builtin ``type``; it is kept
    # because callers may pass it by keyword.
    if type == "document":
        docs = splitDoc(data)
        return Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())
    if type == "text":
        texts = splitText(data)
        return Chroma.from_texts(texts=texts, embedding=OpenAIEmbeddings())
    # Fail loudly instead of falling through and returning None, which would
    # only surface later as an AttributeError far from the real mistake.
    raise ValueError(
        f"Unsupported type {type!r}; expected 'document' or 'text'"
    )