Spaces:

NextGenLabs
/

ai_agents

Running

Ilyas KHIAT

first push

56a3465 5 months ago

1.3 kB

	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_openai import OpenAIEmbeddings
	from langchain_community.vectorstores import FAISS



	def get_text_from_content_for_doc(content):
	    """Concatenate the ``"texte"`` field of every entry in *content*.

	    Iterating *content* must yield values that can be used to index it, and
	    each indexed entry must expose a ``"texte"`` key holding a string.
	    (Presumably a dict mapping page identifiers to per-page dicts -- confirm
	    against the caller.)

	    Returns:
	        All ``"texte"`` strings joined in iteration order with no separator;
	        the empty string when *content* is empty.
	    """
	    return "".join(content[page_key]["texte"] for page_key in content)

	def get_text_from_content_for_audio(content):
	    """Return the value stored under the ``"transcription"`` key of *content*.

	    Raises:
	        KeyError: If a dict *content* has no ``"transcription"`` key.
	    """
	    transcription = content["transcription"]
	    return transcription


	def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
	    """Split *text* into overlapping chunks with ``RecursiveCharacterTextSplitter``.

	    Args:
	        text: The string to split.
	        chunk_size: Character length of each chunk. Defaults to 500, the
	            value that was previously hard-coded.
	        chunk_overlap: Character length of the overlap between consecutive
	            chunks. Defaults to 100, the value that was previously hard-coded.

	    Returns:
	        The chunks produced by the splitter's ``split_text`` for *text*.
	    """
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        # Lengths are measured in characters via the built-in len().
	        length_function=len,
	    )
	    return text_splitter.split_text(text)

	def get_vectorstore(text_chunks):
	    """Build a FAISS vector store from *text_chunks*.

	    The texts are embedded with ``OpenAIEmbeddings`` using the
	    ``"text-embedding-3-small"`` model and handed to ``FAISS.from_texts``.

	    Returns:
	        The object returned by ``FAISS.from_texts``.
	    """
	    return FAISS.from_texts(
	        texts=text_chunks,
	        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
	    )

	def setup_rag(file_type, content):
	    """Extract text from *content*, chunk it, and index it in a vector store.

	    Args:
	        file_type: ``"pdf"`` to read the content with
	            ``get_text_from_content_for_doc``, or ``"audio"`` to read it with
	            ``get_text_from_content_for_audio``.
	        content: The parsed payload handed to the matching extractor.

	    Returns:
	        The vector store returned by ``get_vectorstore`` for the chunked text.

	    Raises:
	        ValueError: If *file_type* is neither ``"pdf"`` nor ``"audio"``.
	    """
	    if file_type == "pdf":
	        text = get_text_from_content_for_doc(content)
	    elif file_type == "audio":
	        text = get_text_from_content_for_audio(content)
	    else:
	        # Without this branch `text` stays unbound and the chunking step
	        # fails with an opaque UnboundLocalError; fail early and explicitly.
	        raise ValueError(
	            f"Unsupported file_type {file_type!r}: expected 'pdf' or 'audio'"
	        )

	    chunks = get_text_chunks(text)
	    return get_vectorstore(chunks)