import os

from pyvi.ViTokenizer import tokenize
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pymongo

from generate_embedding import generate_embedding

# SECURITY(review): these credentials were committed to source control and must be
# considered compromised — rotate them. They are kept only as fallbacks so the
# script still runs; supply real values via environment variables instead.
os.environ.setdefault(  # setdefault: do not clobber a key already in the environment
    "OPENAI_API_KEY",
    "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",  # FIXME: rotate & remove
)

MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    # FIXME: rotate & remove — password is exposed in the URI below.
    "mongodb+srv://rag:p9vojYc9fafYwxE9@rag.xswi7nq.mongodb.net/?retryWrites=true&w=majority&appName=RAG",
)

# Connect DB
client = pymongo.MongoClient(MONGODB_URI)
db = client.rag
collection = db.pdf


def insertData(chunk):
    """Insert the given list of chunk documents into the ``pdf`` collection."""
    return collection.insert_many(chunk)


def deleteByUserId(user_id: str):
    """Remove every document previously indexed under *user_id*."""
    return collection.delete_many({"user_id": user_id})


def readFromPDF():
    """Load ``data/cds.pdf``, drop early pages, and split the rest into chunks.

    Returns:
        list[dict]: one ``{"page_content": str, "index": int}`` entry per chunk
        produced by the recursive character splitter.
    """
    # load PDF
    loader = PyPDFLoader("data/cds.pdf")
    pages = loader.load_and_split()
    # Pages with metadata index < 10 are skipped (presumably front matter —
    # TODO confirm against the actual document).
    pages = [page for page in pages if page.metadata["page"] >= 10]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=768, chunk_overlap=200)
    chunks = text_splitter.split_documents(pages)
    return [
        {"page_content": chunk.page_content, "index": index}
        for index, chunk in enumerate(chunks)
    ]


def indexData(user_id: str):
    """(Re-)index the PDF for *user_id*: embed each chunk and store it in MongoDB.

    Existing documents for the user are deleted before the fresh batch is
    inserted, so repeated runs do not accumulate duplicates.
    """
    items = readFromPDF()
    contents = []
    for item in items:
        # Vietnamese word segmentation before computing the embedding.
        tokenized_page_content = tokenize(item["page_content"])
        contents.append(
            {
                "page_content": item["page_content"],
                "page_content_embedding": generate_embedding(tokenized_page_content),
                "user_id": user_id,
                "index": item["index"],
            }
        )
    deleteByUserId(user_id)
    insertData(contents)


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers a full re-index.
    indexData("cds.pdf")

# prompt = hub.pull("rlm/rag-prompt")
# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# def format_docs(docs):
#     return "\n\n".join(doc.page_content for doc in docs)
# rag_chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )