Spaces:
Running
Running
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_openai import OpenAIEmbeddings | |
from langchain_community.vectorstores import FAISS | |
def get_text_from_content_for_doc(content):
    """Concatenate the text of every page of a parsed document.

    Args:
        content: Mapping of page key -> page record, where each page record
            is itself indexable by the key ``"texte"`` and yields that
            page's text as a string.

    Returns:
        The page texts joined together, in the iteration order of
        ``content``, with no separator inserted between pages. An empty
        ``content`` yields an empty string.
    """
    # A single join avoids rebuilding the accumulated string on every page.
    return "".join(content[page]["texte"] for page in content)
def get_text_from_content_for_audio(content):
    """Return the transcription text stored in an audio content record.

    ``content`` must be indexable by the key ``"transcription"``; a missing
    key propagates the lookup error to the caller.
    """
    transcript = content["transcription"]
    return transcript
def get_text_chunks(text, *, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks for embedding.

    Args:
        text: The full text to split.
        chunk_size: Maximum length of a chunk, measured with ``len`` (i.e.
            in characters). Defaults to 500.
        chunk_overlap: Number of characters shared between consecutive
            chunks. Defaults to 100.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,        # character length of each chunk
        chunk_overlap=chunk_overlap,  # character overlap between chunks
        length_function=len,          # measure length in characters
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from the given text chunks.

    The chunks are embedded with the OpenAI ``text-embedding-3-small``
    model and handed to ``FAISS.from_texts``; the resulting store is
    returned.
    """
    embedder = OpenAIEmbeddings(model="text-embedding-3-small")
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
def setup_rag(file_type, content):
    """Build the retrieval vector store for an uploaded file.

    Args:
        file_type: Either ``"pdf"`` or ``"audio"``; selects how the raw
            text is extracted from ``content``.
        content: The parsed file content, in the shape expected by the
            matching ``get_text_from_content_for_*`` helper.

    Returns:
        The vector store built by ``get_vectorstore`` from the chunked text.

    Raises:
        ValueError: If ``file_type`` is not one of the supported values.
    """
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        # Without this branch `text` would be unbound and the failure would
        # surface as an opaque UnboundLocalError on the next line.
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'pdf' or 'audio'"
        )
    chunks = get_text_chunks(text)
    return get_vectorstore(chunks)