SoumyaJ committed on
Commit
75f3f8a
·
verified ·
1 Parent(s): bafb004

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -131
app.py CHANGED
@@ -1,131 +1,167 @@
1
- from fastapi import FastAPI, UploadFile,File,HTTPException
2
- from fastapi.responses import JSONResponse
3
- from fastapi.middleware.cors import CORSMiddleware
4
- from dotenv import load_dotenv
5
- from langchain_community.document_loaders import PyMuPDFLoader
6
- from langchain_text_splitters import RecursiveCharacterTextSplitter
7
- from langchain_huggingface import HuggingFaceEmbeddings
8
- from langchain_core.prompts import ChatPromptTemplate
9
- from langchain_core.output_parsers import StrOutputParser
10
- from langchain_groq import ChatGroq
11
- from langchain_pinecone import PineconeVectorStore
12
- from langchain_core.runnables import RunnablePassthrough
13
- from pathlib import Path
14
- import uvicorn
15
- import shutil
16
- import os
17
- import hashlib
18
- from pinecone import Pinecone
19
-
20
- app = FastAPI()
21
-
22
- app.add_middleware(
23
- CORSMiddleware,
24
- allow_origins=["*"],
25
- allow_credentials=True,
26
- allow_methods=["*"],
27
- allow_headers=["*"],
28
- )
29
-
30
- UPLOAD_DIR = "uploads"
31
- os.makedirs(UPLOAD_DIR, exist_ok=True)
32
-
33
- index_name = "pinecone-chatbot"
34
-
35
- load_dotenv()
36
- os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")
37
- os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")
38
- os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
39
-
40
- llm = ChatGroq(model_name = "Llama3-8b-8192")
41
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
42
-
43
- prompt = '''You are given a context below. Use it to answer the question that follows.
44
- Provide a concise and factual response. If the answer is not in the context, simply state "I don't know based on context provided."
45
-
46
- <context>
47
- {context}
48
- </context>
49
-
50
- Question: {question}
51
- Answer:'''
52
-
53
- parser = StrOutputParser()
54
-
55
- pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
56
- index = pc.Index(name=index_name)
57
-
58
- def generate_file_id(file_path):
59
- hasher = hashlib.md5()
60
- with open(file_path, "rb") as f:
61
- hasher.update(f.read())
62
- return hasher.hexdigest()
63
-
64
- def delete_existing_embedding(file_id):
65
- index_stats = index.describe_index_stats()
66
- if index_stats["total_vector_count"] > 0:
67
- index.delete(delete_all=True)
68
-
69
- def tempUploadFile(filePath,file):
70
- with open(filePath,'wb') as buffer:
71
- shutil.copyfileobj(file.file, buffer)
72
-
73
- def loadAndSplitDocuments(filePath):
74
- loader = PyMuPDFLoader(filePath)
75
- docs = loader.load()
76
-
77
- splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
78
- final_chunks = splitter.split_documents(docs)
79
- return final_chunks
80
-
81
- def prepare_retriever(filePath = "", load_from_pinecone = False):
82
- if load_from_pinecone:
83
- vector_store = PineconeVectorStore.from_existing_index(index_name, embeddings)
84
- return vector_store.as_retriever(search_kwargs={"k": 5})
85
- elif filePath:
86
- doc_chunks = loadAndSplitDocuments(filePath)
87
- vector_data = []
88
-
89
- for i, doc in enumerate(doc_chunks):
90
- embedding = embeddings.embed_query(doc.page_content)
91
- if embedding:
92
- metadata = {
93
- "text": doc.page_content,
94
- "source": doc.metadata.get("source", "unknown"),
95
- "page": doc.metadata.get("page", i), # Add page info if available
96
- }
97
- vector_data.append((str(i), embedding, metadata))
98
- print(f"Upserting {len(vector_data)} records into Pinecone...")
99
- index.upsert(vectors=vector_data)
100
- print("Upsert complete")
101
-
102
- def get_retriever_chain(retriever):
103
- chat_prompt = ChatPromptTemplate.from_template(prompt)
104
- chain =({"context": retriever, "question": RunnablePassthrough()} | chat_prompt | llm | parser)
105
- return chain
106
-
107
- @app.post("/UploadFileInStore")
108
- def UploadFileInStore(file: UploadFile = File(...)):
109
- if not file.filename.endswith('.pdf'):
110
- raise HTTPException(status_code=400, detail="File must be a pdf file")
111
-
112
- filePath = Path(UPLOAD_DIR) / file.filename
113
- tempUploadFile(filePath,file)
114
- file_id = generate_file_id(filePath)
115
- delete_existing_embedding(file_id)
116
- prepare_retriever(filePath)
117
-
118
- if os.path.exists(filePath):
119
- os.remove(filePath)
120
- return JSONResponse({"message": "File uploaded successfully"})
121
-
122
-
123
- @app.get("/QnAFromPdf")
124
- async def QnAFromPdf(query: str):
125
- retriever = prepare_retriever(load_from_pinecone=True)
126
- chain = get_retriever_chain(retriever)
127
- response = chain.invoke(query)
128
- return response
129
-
130
- if __name__ == "__main__":
131
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile,File,HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from dotenv import load_dotenv
5
+ from langchain_community.document_loaders import PyMuPDFLoader
6
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
7
+ from langchain_huggingface import HuggingFaceEmbeddings
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from langchain_core.output_parsers import StrOutputParser
10
+ from langchain_groq import ChatGroq
11
+ from langchain_pinecone import PineconeVectorStore
12
+ from langchain_core.runnables import RunnablePassthrough
13
+ from pathlib import Path
14
+ import uvicorn
15
+ import shutil
16
+ import os
17
+ import hashlib
18
+ from pinecone import Pinecone
19
+ import fitz
20
+ import pytesseract
21
+ from PIL import Image
22
+ from langchain.schema import Document
23
+ import io
24
+
25
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE(review): wide-open CORS — acceptable for a demo, tighten for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory used to stage uploaded PDFs before they are indexed.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

index_name = "pinecone-chatbot"

# Load .env first so os.getenv below sees locally-defined secrets.
load_dotenv()

# Export only variables that are actually set: `os.environ[k] = None`
# raises TypeError, which previously crashed the app at import time
# whenever one of these keys was missing from the environment/.env.
for _key in ("HF_TOKEN", "PINECONE_API_KEY", "GROQ_API_KEY"):
    _value = os.getenv(_key)
    if _value is not None:
        os.environ[_key] = _value

llm = ChatGroq(model_name="Llama3-8b-8192")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

prompt = '''You are given a context below. Use it to answer the question that follows.
Provide a concise and factual response. If the answer is not in the context, simply state "I don't know based on context provided."

<context>
{context}
</context>

Question: {question}
Answer:'''

parser = StrOutputParser()

# Shared Pinecone client/index handles used by the helpers below.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index = pc.Index(name=index_name)
62
+
63
def generate_file_id(file_path):
    """Return the hex MD5 digest of the file's bytes, used as a content ID.

    Reads the file in fixed-size chunks so large uploads do not have to
    fit in memory (the previous version did a single ``f.read()``).
    """
    hasher = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
68
+
69
def delete_existing_embedding(file_id):
    """Clear the Pinecone index before a new document is indexed.

    NOTE(review): *file_id* is accepted but never used — the function
    deletes every vector in the index regardless of which file produced
    it. Presumably per-file deletion was intended; confirm, and filter
    by a file-id metadata field if so.
    """
    index_stats = index.describe_index_stats()
    # Only issue the delete when there is something to remove.
    if index_stats["total_vector_count"] > 0:
        index.delete(delete_all=True)
73
+
74
def tempUploadFile(filePath, file):
    """Persist an uploaded file's stream to *filePath* on disk."""
    with open(filePath, "wb") as destination:
        while chunk := file.file.read(64 * 1024):
            destination.write(chunk)
77
+
78
def loadAndSplitDocuments(filePath):
    """Load a PDF via PyMuPDFLoader and split it into overlapping chunks.

    Legacy text-only loader; returns RecursiveCharacterTextSplitter
    chunks (2000 chars, 500 overlap).
    """
    pages = PyMuPDFLoader(filePath).load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
    return chunker.split_documents(pages)
85
+
86
def loadAndSplitPdfFile(filePath):
    """Extract page text plus OCR'd image text from a PDF, then chunk it.

    Each page contributes one Document for its text layer and one
    Document per embedded image that yields non-empty OCR text. Returns
    the chunks produced by a RecursiveCharacterTextSplitter (2000 chars,
    500 overlap).
    """
    documents = []
    doc = fitz.open(filePath)
    try:
        for page_number, page in enumerate(doc):
            text = page.get_text("text")  # page's text layer
            metadata = {"source": filePath, "page": page_number + 1}

            if text.strip():
                documents.append(Document(page_content=text, metadata=metadata))

            # OCR every embedded image so scanned/graphical content is searchable.
            for img_index, img_info in enumerate(page.get_images(full=True)):
                # Fixed: the original reused the loop variable `img` for the
                # PIL image, shadowing the PyMuPDF image record.
                xref = img_info[0]  # first tuple element is the image xref
                image_bytes = doc.extract_image(xref)["image"]
                with Image.open(io.BytesIO(image_bytes)) as image:
                    ocr_text = pytesseract.image_to_string(image)
                if ocr_text.strip():
                    img_metadata = metadata.copy()
                    img_metadata["type"] = "image"
                    img_metadata["image_index"] = img_index
                    documents.append(Document(page_content=ocr_text, metadata=img_metadata))
    finally:
        # The document handle holds OS resources; close it even if OCR fails
        # (the original leaked it).
        doc.close()

    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
    return splitter.split_documents(documents)
116
+
117
def prepare_retriever(filePath="", load_from_pinecone=False):
    """Return a retriever over the Pinecone index, or index *filePath*.

    Args:
        filePath: path of a PDF to embed and upsert (ignored when
            ``load_from_pinecone`` is True).
        load_from_pinecone: when True, skip indexing and return a
            retriever (k=5) over the vectors already in the index.

    Returns:
        A retriever when ``load_from_pinecone`` is True; otherwise None —
        the call is used purely for its upsert side effect.
    """
    if load_from_pinecone:
        vector_store = PineconeVectorStore.from_existing_index(index_name, embeddings)
        return vector_store.as_retriever(search_kwargs={"k": 5})

    if not filePath:
        return None

    doc_chunks = loadAndSplitPdfFile(filePath)
    if not doc_chunks:
        # Nothing extracted (e.g. empty or image-free scan with no OCR hits).
        print("Upserting 0 records into Pinecone...")
        print("Upsert complete")
        return None

    # Embed all chunks in one batched call instead of one model call per
    # chunk (the original looped over embed_query).
    chunk_texts = [doc.page_content for doc in doc_chunks]
    chunk_embeddings = embeddings.embed_documents(chunk_texts)

    vector_data = []
    for i, (doc, embedding) in enumerate(zip(doc_chunks, chunk_embeddings)):
        if embedding:
            metadata = {
                "text": doc.page_content,
                "source": doc.metadata.get("source", "unknown"),
                "page": doc.metadata.get("page", i),  # Add page info if available
            }
            vector_data.append((str(i), embedding, metadata))

    print(f"Upserting {len(vector_data)} records into Pinecone...")
    # Upsert in batches to stay under Pinecone's per-request size limits.
    for start in range(0, len(vector_data), 100):
        index.upsert(vectors=vector_data[start:start + 100])
    print("Upsert complete")
137
+
138
def get_retriever_chain(retriever):
    """Compose the RAG chain: retrieve context, fill the prompt, ask the LLM."""
    chat_prompt = ChatPromptTemplate.from_template(prompt)
    inputs = {"context": retriever, "question": RunnablePassthrough()}
    return inputs | chat_prompt | llm | parser
142
+
143
@app.post("/UploadFileInStore")
def UploadFileInStore(file: UploadFile = File(...)):
    """Accept a PDF upload and (re)build its embeddings in Pinecone.

    Raises:
        HTTPException(400): when the uploaded filename is not a .pdf.
    """
    # Case-insensitive check so 'REPORT.PDF' is accepted as well.
    if not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="File must be a pdf file")

    filePath = Path(UPLOAD_DIR) / file.filename
    tempUploadFile(filePath, file)
    try:
        file_id = generate_file_id(filePath)
        delete_existing_embedding(file_id)
        prepare_retriever(filePath)
    finally:
        # Always remove the staged upload, even when indexing raises
        # (the original leaked the temp file on failure).
        if os.path.exists(filePath):
            os.remove(filePath)
    return JSONResponse({"message": "File uploaded successfully"})
157
+
158
+
159
@app.get("/QnAFromPdf")
async def QnAFromPdf(query: str):
    """Answer *query* using the documents already indexed in Pinecone."""
    rag_chain = get_retriever_chain(prepare_retriever(load_from_pinecone=True))
    return rag_chain.invoke(query)
165
+
166
if __name__ == "__main__":
    # Allow running the API directly with `python app.py`; binds all interfaces.
    uvicorn.run(app, host="0.0.0.0", port=8000)