"""Helpers that turn extracted document/audio content into a FAISS vector store."""

from __future__ import annotations

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS


def get_text_from_content_for_doc(content: dict) -> str:
    """Concatenate the text of every page of a parsed document.

    Args:
        content: Mapping whose values are themselves mappings holding the
            page text under the ``"texte"`` key. Pages are visited in the
            mapping's iteration order.

    Returns:
        The page texts joined together with no separator.
    """
    # NOTE(review): pages are joined without any separator, so the last word
    # of one page touches the first word of the next -- confirm this is wanted.
    return "".join(content[page]["texte"] for page in content)


def get_text_from_content_for_audio(content: dict) -> str:
    """Return the transcription stored under ``content["transcription"]``."""
    return content["transcription"]


def get_text_chunks(
    text: str,
    chunk_size: int = 500,
    chunk_overlap: int = 100,
) -> list[str]:
    """Split ``text`` into overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum length of a chunk, measured in characters.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Lengths are measured with the built-in len(), i.e. in characters.
        length_function=len,
    )
    return text_splitter.split_text(text)


def get_vectorstore(text_chunks: list[str]) -> FAISS:
    """Embed ``text_chunks`` and index them in a FAISS vector store.

    Embeddings come from the ``text-embedding-3-small`` OpenAI model.

    Args:
        text_chunks: The pieces of text to embed and index.

    Returns:
        The store built by ``FAISS.from_texts``.
    """
    # presumably requires OpenAI credentials in the environment -- verify
    # against the deployment configuration.
    embedding = OpenAIEmbeddings(model="text-embedding-3-small")
    return FAISS.from_texts(texts=text_chunks, embedding=embedding)


def setup_rag(file_type: str, content: dict) -> FAISS:
    """Build a vector store from the extracted content of a file.

    Args:
        file_type: ``"pdf"`` or ``"audio"``; selects how the text is read
            from ``content``.
        content: The extracted content, in the layout expected by
            ``get_text_from_content_for_doc`` (for ``"pdf"``) or
            ``get_text_from_content_for_audio`` (for ``"audio"``).

    Returns:
        The vector store built from the chunked text.

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"audio"``.
    """
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        # Without this branch ``text`` would be unbound below and the caller
        # would only see an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'pdf' or 'audio'"
        )

    chunks = get_text_chunks(text)
    return get_vectorstore(chunks)