Spaces:
Runtime error
Runtime error
File size: 2,010 Bytes
841b0ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import os
from pyvi.ViTokenizer import tokenize
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pymongo
from generate_embedding import generate_embedding
# SECURITY: secrets are read from the process environment instead of being
# embedded in this file. Any API key or database password that was ever
# committed here must be treated as leaked and rotated.
#
# OPENAI_API_KEY is no longer overwritten by this module; it has to be present
# in the environment (e.g. a deployment secret) before the process starts.
# NOTE(review): presumably consumed by generate_embedding / OpenAI clients --
# confirm which component actually reads it.

# Connect DB
_mongo_uri = os.environ.get("MONGODB_URI")
if not _mongo_uri:
    # Fail early with a clear message rather than with an opaque driver error.
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string "
        "before running this module"
    )
client = pymongo.MongoClient(_mongo_uri)
db = client.rag  # database named "rag"
collection = db.pdf  # collection named "pdf" that stores the indexed chunks
def insertData(chunk):
    """Insert a batch of documents into ``collection``.

    Args:
        chunk: list of dict documents to store.

    Returns:
        The result of ``collection.insert_many(chunk)``, or ``None`` when
        ``chunk`` is empty (nothing is sent to the database in that case).
    """
    # PyMongo's insert_many rejects an empty document list, so an empty batch
    # is treated as a no-op instead of surfacing a driver error.
    if not chunk:
        return None
    return collection.insert_many(chunk)
def deleteByUserId(user_id: str):
    """Delete every document in ``collection`` whose ``user_id`` equals *user_id*.

    Args:
        user_id: value matched against the ``user_id`` field.

    Returns:
        Whatever ``collection.delete_many`` returns for that filter.
    """
    query = {"user_id": user_id}
    outcome = collection.delete_many(query)
    return outcome
def readFromPDF(
    pdf_path: str = "data/cds.pdf",
    min_page: int = 10,
    chunk_size: int = 768,
    chunk_overlap: int = 200,
):
    """Load a PDF and cut it into overlapping text chunks.

    The defaults reproduce the previously hard-coded behaviour, so calling
    ``readFromPDF()`` with no arguments yields the same chunks as before.

    Args:
        pdf_path: path of the PDF file to load.
        min_page: pages whose ``metadata["page"]`` is below this value are
            dropped. NOTE(review): presumably a 0-based page number set by
            PyPDFLoader, used to skip front matter -- confirm.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.

    Returns:
        A list of dicts ``{"page_content": <chunk text>, "index": <position>}``
        where ``index`` is the chunk's position in the split output.
    """
    # load PDF
    loader = PyPDFLoader(pdf_path)
    pages = [
        page
        for page in loader.load_and_split()
        if page.metadata["page"] >= min_page
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(pages)

    return [
        {"page_content": chunk.page_content, "index": index}
        for index, chunk in enumerate(chunks)
    ]
def indexData(user_id: str):
    """Rebuild the stored chunks for *user_id* from the PDF.

    Reads the chunks via ``readFromPDF()``, tokenizes each chunk's text with
    ``tokenize``, computes its embedding with ``generate_embedding``, then
    replaces the documents stored for *user_id* with the new ones.

    Args:
        user_id: value written to each document's ``user_id`` field and used
            to select the old documents to delete.

    Raises:
        ValueError: if the PDF produced no chunks. This is raised *before*
            the existing documents are deleted, so a bad or empty PDF does
            not wipe previously indexed data.
    """
    # Every embedding is computed up front so that a failure while embedding
    # leaves the existing documents untouched.
    contents = [
        {
            "page_content": item["page_content"],
            "page_content_embedding": generate_embedding(
                tokenize(item["page_content"])
            ),
            "user_id": user_id,
            "index": item["index"],
        }
        for item in readFromPDF()
    ]

    if not contents:
        raise ValueError(
            f"no chunks were extracted from the PDF; nothing to index for {user_id!r}"
        )

    deleteByUserId(user_id)
    insertData(contents)
# Module-level call: executes whenever this file is run or imported, and
# re-indexes the PDF under the user id "cds.pdf".
indexData("cds.pdf")
# prompt = hub.pull("rlm/rag-prompt")
# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# def format_docs(docs):
# return "\n\n".join(doc.page_content for doc in docs)
# rag_chain = (
# {"context": retriever | format_docs, "question": RunnablePassthrough()}
# | prompt
# | llm
# | StrOutputParser()
# )
|