SoumyaJ committed on
Commit
f9f63e1
·
verified ·
1 Parent(s): 90223df

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +131 -0
  2. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile,File,HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from dotenv import load_dotenv
5
+ from langchain_community.document_loaders import PyMuPDFLoader
6
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
7
+ from langchain_huggingface import HuggingFaceEmbeddings
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from langchain_core.output_parsers import StrOutputParser
10
+ from langchain_groq import ChatGroq
11
+ from langchain_pinecone import PineconeVectorStore
12
+ from langchain_core.runnables import RunnablePassthrough
13
+ from pathlib import Path
14
+ import uvicorn
15
+ import shutil
16
+ import os
17
+ import hashlib
18
+ from pinecone import Pinecone
19
+
20
# --- Application setup -------------------------------------------------------
# FastAPI app with permissive CORS so any frontend origin can call the API.
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE(review): wide open; tighten for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Local scratch directory for uploaded PDFs (each upload is removed again
# after indexing).
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

index_name = "pinecone-chatbot"

# Load .env first, then fail fast with a clear message if a required key is
# absent. (The original `os.environ[k] = os.getenv(k)` raised an opaque
# TypeError when a variable was missing; when present it was a no-op since
# load_dotenv already populates os.environ.)
load_dotenv()
for _key in ("HF_TOKEN", "PINECONE_API_KEY", "GROQ_API_KEY"):
    _val = os.getenv(_key)
    if _val is None:
        raise RuntimeError(f"Required environment variable {_key} is not set")
    os.environ[_key] = _val

# Shared model/embedding clients, created once at import time.
llm = ChatGroq(model_name="Llama3-8b-8192")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# RAG prompt; {context} and {question} are filled in by the chain built in
# get_retriever_chain().
prompt = '''You are given a context below. Use it to answer the question that follows.
Provide a concise and factual response. If the answer is not in the context, simply state "I don't know based on context provided."

<context>
{context}
</context>

Question: {question}
Answer:'''

parser = StrOutputParser()

# Pinecone client/index handle shared by the upload and query endpoints.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index = pc.Index(name=index_name)
57
+
58
def generate_file_id(file_path):
    """Return an MD5 hex digest of the file's contents.

    Used as a stable, content-based identifier for an uploaded file.
    Hashes in fixed-size chunks so large PDFs are not loaded into memory
    in one shot (the original read the whole file with a single read()).
    """
    hasher = hashlib.md5()
    with open(file_path, "rb") as f:
        # Read until EOF, 64 KiB at a time.
        while chunk := f.read(65536):
            hasher.update(chunk)
    return hasher.hexdigest()
63
+
64
def delete_existing_embedding(file_id):
    """Clear the Pinecone index before re-indexing a new upload.

    NOTE(review): `file_id` is accepted but never used — whenever the index
    is non-empty, ALL vectors are deleted, so only one document's embeddings
    live in the index at a time. If per-file deletion was intended, vectors
    would need to carry `file_id` metadata and be deleted by filter instead
    — confirm with the caller before changing this.
    """
    index_stats = index.describe_index_stats()
    if index_stats["total_vector_count"] > 0:
        index.delete(delete_all=True)
68
+
69
def tempUploadFile(filePath, file):
    """Persist an uploaded file's stream to `filePath` on local disk."""
    with open(filePath, "wb") as destination:
        shutil.copyfileobj(file.file, destination)
72
+
73
def loadAndSplitDocuments(filePath):
    """Load the PDF at `filePath` and split it into overlapping chunks.

    Chunks are 2000 characters with 500 characters of overlap, keeping
    enough surrounding text for retrieval to stay coherent.
    """
    documents = PyMuPDFLoader(filePath).load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
    return chunker.split_documents(documents)
80
+
81
def prepare_retriever(filePath = "", load_from_pinecone = False, batch_size = 100):
    """Return a retriever over the existing index, or embed + upsert a new document.

    Args:
        filePath: local PDF to index (ignored when load_from_pinecone is True).
        load_from_pinecone: when True, build a retriever over the
            already-populated Pinecone index instead of indexing anything.
        batch_size: vectors per upsert request; Pinecone caps request sizes,
            so one giant upsert can fail on large PDFs.

    Returns:
        A retriever (k=5) when load_from_pinecone is True; otherwise None
        after upserting the document's embeddings.
    """
    if load_from_pinecone:
        vector_store = PineconeVectorStore.from_existing_index(index_name, embeddings)
        return vector_store.as_retriever(search_kwargs={"k": 5})
    elif filePath:
        doc_chunks = loadAndSplitDocuments(filePath)

        # Batch-embed all chunks in one call rather than one model round
        # trip per chunk (the original called embed_query inside the loop).
        texts = [doc.page_content for doc in doc_chunks]
        chunk_embeddings = embeddings.embed_documents(texts)

        vector_data = []
        for i, (doc, embedding) in enumerate(zip(doc_chunks, chunk_embeddings)):
            if embedding:
                metadata = {
                    "text": doc.page_content,
                    "source": doc.metadata.get("source", "unknown"),
                    "page": doc.metadata.get("page", i),  # Add page info if available
                }
                vector_data.append((str(i), embedding, metadata))

        print(f"Upserting {len(vector_data)} records into Pinecone...")
        # Upsert in batches to stay under Pinecone's per-request size limit.
        for start in range(0, len(vector_data), batch_size):
            index.upsert(vectors=vector_data[start:start + batch_size])
        print("Upsert complete")
101
+
102
def get_retriever_chain(retriever):
    """Compose the RAG chain: retrieve context, fill the prompt template,
    query the LLM, and parse the reply to a plain string."""
    chat_prompt = ChatPromptTemplate.from_template(prompt)
    inputs = {"context": retriever, "question": RunnablePassthrough()}
    return inputs | chat_prompt | llm | parser
106
+
107
@app.post("/UploadFileInStore")
def UploadFileInStore(file: UploadFile = File(...)):
    """Accept a PDF upload, rebuild its embeddings in Pinecone, and clean up
    the temporary local copy.

    Raises:
        HTTPException(400): if the upload is not a .pdf file.
    """
    # Case-insensitive extension check; also guards a missing filename,
    # which previously raised AttributeError (a 500) instead of a 400.
    if not (file.filename and file.filename.lower().endswith('.pdf')):
        raise HTTPException(status_code=400, detail="File must be a pdf file")

    # Keep only the base name so a crafted filename (e.g. "../x.pdf")
    # cannot escape UPLOAD_DIR.
    filePath = Path(UPLOAD_DIR) / Path(file.filename).name
    tempUploadFile(filePath, file)
    try:
        file_id = generate_file_id(filePath)
        delete_existing_embedding(file_id)
        prepare_retriever(filePath)
    finally:
        # Remove the scratch copy even if embedding/indexing fails
        # (previously the file leaked on any exception).
        if os.path.exists(filePath):
            os.remove(filePath)
    return JSONResponse({"message": "File uploaded successfully"})
121
+
122
+
123
@app.get("/QnAFromPdf")
async def QnAFromPdf(query: str):
    """Answer `query` against the documents currently stored in Pinecone."""
    retriever = prepare_retriever(load_from_pinecone=True)
    qa_chain = get_retriever_chain(retriever)
    return qa_chain.invoke(query)
129
+
130
# Run a local development server when executed directly (e.g. `python app.py`);
# binds on all interfaces, port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-community
3
+ langchain-groq
4
+ langchain-core
5
+ python-dotenv
6
+ PyMuPDF
7
+ langchain-huggingface
8
+ sentence-transformers
9
+ langchain-pinecone
10
+ fastapi
11
+ uvicorn
12
+ python-multipart