Spaces:

NextGenLabs
/

ai_agents

Running

File size: 1,304 Bytes

56a3465

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS



def get_text_from_content_for_doc(content):
    """Concatenate the text of every page of a parsed document.

    Args:
        content: Mapping of page key -> page mapping, where each page
            mapping has a ``"texte"`` entry holding that page's text.
            Pages are read in the mapping's iteration order.

    Returns:
        All page texts joined with no separator; ``""`` when ``content``
        is empty.
    """
    # str.join assembles the result in one pass instead of growing a
    # string with repeated ``+=``.
    return "".join(content[page]["texte"] for page in content)

def get_text_from_content_for_audio(content):
    """Return the value stored under the ``"transcription"`` key of ``content``."""
    transcript = content["transcription"]
    return transcript


def get_text_chunks(text, *, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks measured in characters.

    Args:
        text: The string to split.
        chunk_size: Target maximum chunk length, in characters
            (default 500).
        chunk_overlap: Number of characters shared between consecutive
            chunks (default 100).

    Returns:
        The chunks returned by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # measure length with len(), i.e. in characters
    )
    return text_splitter.split_text(text)

def get_vectorstore(text_chunks):
    """Build a FAISS vector store from ``text_chunks``.

    The chunks are embedded through ``OpenAIEmbeddings`` configured with the
    ``text-embedding-3-small`` model and indexed via ``FAISS.from_texts``.
    """
    embedder = OpenAIEmbeddings(model="text-embedding-3-small")
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)

def setup_rag(file_type, content):
    """Build a vector store from the extracted content of a file.

    Args:
        file_type: ``"pdf"`` or ``"audio"``; selects which helper pulls the
            raw text out of ``content``.
        content: Extracted content, in the shape expected by
            ``get_text_from_content_for_doc`` (for ``"pdf"``) or
            ``get_text_from_content_for_audio`` (for ``"audio"``).

    Returns:
        The vector store returned by ``get_vectorstore``.

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"audio"``.
    """
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        # Fail fast with a clear message; without this branch ``text`` would
        # be unbound and the call below would raise UnboundLocalError.
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'pdf' or 'audio'"
        )

    chunks = get_text_chunks(text)
    return get_vectorstore(chunks)