first commit

- docker-compose.yml +19 -0
- main.py +420 -0
- requirements.txt +12 -0
- srs/utils.py +429 -0
docker-compose.yml
ADDED
@@ -0,0 +1,19 @@
+version: '3.8'
+services:
+  web:
+    build: .
+    ports:
+      - "80:80"
+    depends_on:
+      - redis
+    environment:
+      - REDIS_URL=redis://redis:6379
+  redis:
+    image: "redis/redis-stack-server:latest"
+    ports:
+      - "6379:6379"
+    volumes:
+      - redis_data:/data
+
+volumes:
+  redis_data:
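
Note that the compose file injects REDIS_URL into the web container, while the code below connects with a hardcoded host name. A minimal sketch of consuming the variable instead, assuming redis-py's standard Redis.from_url constructor:

    import os
    import redis

    # Read the connection string injected by docker-compose; fall back to
    # localhost when running outside the container.
    redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379")
    r = redis.Redis.from_url(redis_url, decode_responses=True)
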
main.py
ADDED
@@ -0,0 +1,420 @@
+import base64
+import io
+import json
+import logging
+
+import PyPDF2
+import redis
+from fastapi import FastAPI, HTTPException, status
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field, validator
+from typing import List, Optional
+
+from srs.utils import functions_doc
+
+r = redis.Redis(host='redis', port=6379, db=0, password=None, decode_responses=True)
+# Request and response models
+app = FastAPI()
+
+class ServiceRemovalRequest(BaseModel):
+    token: str
+    servicename: str
+
+class Document(BaseModel):
+    token: str
+    service_name: str
+    document_name: str
+
+class ServicesResponse(BaseModel):
+    message: str
+    added_services: List[str]
+
+class RemoveDocumentsRequest(BaseModel):
+    token: str
+    service_name: str
+    document_names: List[str]
+
+class Service(BaseModel):
+    servicename: str
+
+class TokenServicesRequest(BaseModel):
+    token: str
+    services: List[Service]
+
+class TokenRequest(BaseModel):
+    token: str
+
+class AddDocumentRequest(BaseModel):
+    token: str
+    servicename: str
+    documentname: str
+
+class StoreDocumentServicesRequest(BaseModel):
+    token: str
+    service_name: str
+    document_name: str
+    file: bytes  # base64-encoded file content, sent as a string in the JSON body
+
+class DocumentChunks(BaseModel):
+    token: str
+    service_name: str
+    document_name: str
+    method: str = "chunk_per_page"
+    split_token: Optional[str] = ""
+    start_page: int = 1
+    end_page: int = 1
+
+    @validator('split_token', always=True)
+    def check_split_token(cls, v, values):
+        method = values.get('method')
+        if method == 'personalize_chunking' and not v:
+            raise ValueError('split_token is required when method is personalize_chunking')
+        return v
+
+class DocumentResponse(BaseModel):
+    token: str
+    service_name: str
+    document_name: str
+    method: str = "chunk_per_page"
+    model: str = "gpt-3.5-turbo"
+    # 'schema' shadows a BaseModel attribute in pydantic v1, so the field is
+    # aliased; clients still send the key as "schema".
+    json_schema: dict = Field(..., alias="schema")
+    comment: Optional[dict] = {}
+    split_token: Optional[str] = ""
+    start_page: int = 1
+    end_page: int = 1
+
+    @validator('split_token', always=True)
+    def check_split_token(cls, v, values):
+        method = values.get('method')
+        if method == 'personalize_chunking' and not v:
+            raise ValueError('split_token is required when method is personalize_chunking')
+        return v
@app.post("/add_services", status_code=status.HTTP_201_CREATED)
|
104 |
+
async def add_services(request: TokenServicesRequest):
|
105 |
+
"""
|
106 |
+
Adds a list of services to a given token.
|
107 |
+
|
108 |
+
This endpoint accepts a request with a token and a list of services, attempting to add each service to the specified token.
|
109 |
+
The service information must include all necessary details like name and description.
|
110 |
+
|
111 |
+
Parameters:
|
112 |
+
- request (TokenServicesRequest): A model containing the authorization token and a list of services to be added.
|
113 |
+
|
114 |
+
Returns:
|
115 |
+
- A list of dictionaries, each representing a successfully added service with its details.
|
116 |
+
|
117 |
+
Raises:
|
118 |
+
- HTTPException: 400 Bad Request if any value error occurs during processing, typically due to invalid input.
|
119 |
+
- HTTPException: 500 Internal Server Error if any unexpected errors occur during the process.
|
120 |
+
"""
|
121 |
+
try:
|
122 |
+
# Convert services to dicts
|
123 |
+
services_dicts = [service.dict() for service in request.services]
|
124 |
+
result = functions_doc.add_services_to_token(request.token, services_dicts)
|
125 |
+
return result
|
126 |
+
except ValueError as ve:
|
127 |
+
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
|
128 |
+
except Exception as e:
|
129 |
+
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
|
130 |
+
|
131 |
+
|
132 |
+
|
133 |
+
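+# Example request body for /add_services (illustrative; the token value is a
+# hypothetical client token, not something defined in this commit):
+#   {"token": "demo-token", "services": [{"servicename": "invoices"}]}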
@app.delete("/remove_service/")
|
134 |
+
async def remove_service(request: ServiceRemovalRequest):
|
135 |
+
"""
|
136 |
+
Removes a specified service associated with a token.
|
137 |
+
|
138 |
+
This endpoint allows the removal of a service by its name from a list associated with a given token.
|
139 |
+
Before attempting to remove the service, it verifies that the token exists. If the token does not exist,
|
140 |
+
a 404 error is returned. If the service name does not exist under the token or cannot be removed,
|
141 |
+
a 400 error is raised with a specific message.
|
142 |
+
|
143 |
+
Parameters:
|
144 |
+
- request (ServiceRemovalRequest): A model containing the authorization token and the name of the service to be removed.
|
145 |
+
|
146 |
+
Returns:
|
147 |
+
- A dictionary with a success status and a message indicating the outcome of the operation.
|
148 |
+
|
149 |
+
Raises:
|
150 |
+
- HTTPException: 404 Not Found if the token does not exist.
|
151 |
+
- HTTPException: 400 Bad Request if the service cannot be removed or does not exist under the token.
|
152 |
+
- HTTPException: 500 Internal Server Error for any other unexpected errors.
|
153 |
+
"""
|
154 |
+
try:
|
155 |
+
# Check if the token exists in Redis
|
156 |
+
user_key = f"token:{request.token}:docservices"
|
157 |
+
if not r.exists(user_key):
|
158 |
+
raise HTTPException(status_code=404, detail="Token not found.")
|
159 |
+
# If checks pass, proceed to remove the service
|
160 |
+
manager_doc = functions_doc()
|
161 |
+
result = manager_doc.remove_service_by_name(token=request.token, servicename=request.servicename)
|
162 |
+
if result["success"]:
|
163 |
+
return {"success": True, "message": result["message"]}
|
164 |
+
else:
|
165 |
+
raise HTTPException(status_code=400, detail=result["message"])
|
166 |
+
except Exception as e:
|
167 |
+
raise HTTPException(status_code=500, detail=str(e))
|
168 |
+
|
169 |
+
|
170 |
+
@app.post("/add_and_store_document/", summary="Store a Document in Redis",
|
171 |
+
description="Stores a document as a base64 encoded string in Redis. The document is tagged with additional metadata and associated with a unique key.")
|
172 |
+
async def add_and_store_document(request:StoreDocumentServicesRequest):
|
173 |
+
"""
|
174 |
+
Stores a file document in Redis as a base64 encoded string with its corresponding tags and document names.
|
175 |
+
|
176 |
+
Args:
|
177 |
+
token (str): The unique identifier for the user.
|
178 |
+
service_name (str): The service under which the document will be stored.
|
179 |
+
document_name (str): The name of the document to be stored.
|
180 |
+
file (UploadFile): The file document to be stored as a base64 encoded string.
|
181 |
+
|
182 |
+
Returns:
|
183 |
+
JSONResponse: A JSON response indicating the status of the operation ("success" or "error") and a message describing the result or the error encountered.
|
184 |
+
|
185 |
+
Raises:
|
186 |
+
HTTPException: An HTTP exception is raised with a status code of 400 or 500 depending on the type of error during the storing process.
|
187 |
+
"""
|
188 |
+
try:
|
189 |
+
# Read file content as bytes
|
190 |
+
#encoded_file = await request.file.read()
|
191 |
+
|
192 |
+
# Store the document
|
193 |
+
# doc_manager = functions_doc()
|
194 |
+
response = functions_doc.add_and_store_document(token=request.token, service_name=request.service_name,document_name= request.document_name, encoded_file=request.file)
|
195 |
+
return JSONResponse(status_code=200, content={"status":"success", "message":response})
|
196 |
+
|
197 |
+
except redis.RedisError as e:
|
198 |
+
logging.error(f"Failed to store document: {e}")
|
199 |
+
return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})
|
200 |
+
|
201 |
+
except Exception as e:
|
202 |
+
logging.error(f"An error occurred: {e}")
|
203 |
+
return JSONResponse(status_code=500, content={"status": "error", "message": "An unexpected error occurred"})
|
204 |
+
|
205 |
+
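+# Example request body for /add_and_store_document/ (illustrative; "file"
+# carries the document bytes base64-encoded as a string):
+#   {"token": "demo-token", "service_name": "invoices",
+#    "document_name": "inv-001.pdf", "file": "<base64-encoded PDF>"}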
@app.delete("/remove_documents/", summary="Remove Multiple Documents",
|
206 |
+
description="Removes multiple documents from both the Redis store and the specified service list under a given token.")
|
207 |
+
async def remove_documents(request: RemoveDocumentsRequest):
|
208 |
+
"""
|
209 |
+
Removes multiple documents from Redis storage and their references from a specified service list under a user's token.
|
210 |
+
|
211 |
+
Args:
|
212 |
+
request (RemoveDocumentsRequest): A Pydantic model that includes the token, the service name, and a list of document names to be removed.
|
213 |
+
|
214 |
+
Returns:
|
215 |
+
dict: A dictionary indicating the status of the operation ("success" or "error") and a message describing the result or error encountered.
|
216 |
+
|
217 |
+
Raises:
|
218 |
+
HTTPException: An HTTP exception is raised with status code 400 or 500, depending on the type of error during the document removal process.
|
219 |
+
"""
|
220 |
+
try:
|
221 |
+
manager_doc = functions_doc()
|
222 |
+
response = manager_doc.remove_documents_from_service(token = request.token, service_name = request.service_name, document_names = request.document_names)
|
223 |
+
return response
|
224 |
+
except Exception as e:
|
225 |
+
raise HTTPException(status_code=400, detail=str(e))
|
226 |
+
|
227 |
+
@app.get("/services/", response_model=list)
|
228 |
+
def retrieve_service_names(request: services) -> list:
|
229 |
+
"""
|
230 |
+
Endpoint to retrieve service names for a given token.
|
231 |
+
|
232 |
+
:param token: The unique token of the user.
|
233 |
+
:return: A list of service names associated with the token or an empty list if none are found.
|
234 |
+
"""
|
235 |
+
services_key = f"token:{request.token}:docservices"
|
236 |
+
try:
|
237 |
+
existing_services = r.lrange(services_key, 0, -1)
|
238 |
+
service_names = [json.loads(service)['servicename'] for service in existing_services]
|
239 |
+
return service_names if service_names else []
|
240 |
+
except Exception as e:
|
241 |
+
raise HTTPException(status_code=500, detail=f"Failed to fetch or parse services: {str(e)}")
|
242 |
+
|
243 |
+
@app.get("/documents/", response_model=dict)
|
244 |
+
def retrieve_documents(request:ServiceRemovalRequest) -> dict:
|
245 |
+
"""
|
246 |
+
Endpoint to retrieve document names from a specific service for a given token.
|
247 |
+
|
248 |
+
:param token: The unique token of the user.
|
249 |
+
:param servicename: The name of the service from which to retrieve documents.
|
250 |
+
:return: A dictionary containing a list of documents or a message if the service is not found.
|
251 |
+
"""
|
252 |
+
try:
|
253 |
+
manager_doc = functions_doc()
|
254 |
+
response = manager_doc.get_documents_from_service(token = request.token, servicename = request.servicename)
|
255 |
+
return response
|
256 |
+
except Exception as e:
|
257 |
+
raise HTTPException(status_code=400, detail=str(e))
|
258 |
+
|
259 |
+
@app.get("/get_document/", response_model= Optional[bytes])
|
260 |
+
def get_document(request: Document) -> dict:
|
261 |
+
"""
|
262 |
+
Retrieves a document stored as base64-encoded bytes from a specified service for a given token.
|
263 |
+
|
264 |
+
This endpoint is responsible for fetching a document by name from a specific service associated with a token.
|
265 |
+
The document is expected to be stored in base64-encoded byte format. The response will include the document
|
266 |
+
if available or return an appropriate error message if not found or in case of an error.
|
267 |
+
|
268 |
+
Parameters:
|
269 |
+
- request (Document): A model containing the authorization token, service name, and document name.
|
270 |
+
|
271 |
+
Returns:
|
272 |
+
- A JSON response containing the document in base64-encoded format or an error message.
|
273 |
+
|
274 |
+
Raises:
|
275 |
+
- HTTPException: 400 Bad Request if there's an error during the retrieval process.
|
276 |
+
"""
|
277 |
+
try:
|
278 |
+
manager_doc = functions_doc()
|
279 |
+
response = manager_doc.get_document(token=request.token, service_name=request.service_name, document_name=request.document_name)
|
280 |
+
return JSONResponse(status_code=200, content=response)
|
281 |
+
except Exception as e:
|
282 |
+
raise HTTPException(status_code=400, detail=str(e))
|
283 |
+
@app.get("/get_num_pages/", response_model= dict)
|
284 |
+
def get_num_pages(request: Document) -> dict:
|
285 |
+
"""
|
286 |
+
Retrieves the number of pages in a PDF document from a specified service for a given token.
|
287 |
+
|
288 |
+
This endpoint fetches a document stored as a base64-encoded string, decodes it, and counts the number of pages using PyPDF2.
|
289 |
+
|
290 |
+
Parameters:
|
291 |
+
- request (Document): A model containing the authorization token, service name, and document name.
|
292 |
+
|
293 |
+
Returns:
|
294 |
+
- A JSON response with the status, message including the number of pages, and the number of pages if successful.
|
295 |
+
|
296 |
+
Raises:
|
297 |
+
- HTTPException: 400 Bad Request if there's an error during the retrieval or processing of the document.
|
298 |
+
"""
|
299 |
+
try:
|
300 |
+
manager_doc = functions_doc()
|
301 |
+
response = manager_doc.get_document(token=request.token, service_name=request.service_name, document_name=request.document_name)
|
302 |
+
if response["status"]=="success":
|
303 |
+
decoded_file = base64.b64decode(response["document"])
|
304 |
+
|
305 |
+
# Use BytesIO to create a file-like object in memory from the decoded data
|
306 |
+
pdf_file_like = io.BytesIO(decoded_file)
|
307 |
+
|
308 |
+
# Use PyPDF2 to read the file-like object and count the pages
|
309 |
+
pdf_reader = PyPDF2.PdfReader(pdf_file_like)
|
310 |
+
number_of_pages = len(pdf_reader.pages)
|
311 |
+
return JSONResponse(status_code=200, content={"status": "success", "message": f"Document has {number_of_pages} pages.", "num_pages": number_of_pages})
|
312 |
+
return JSONResponse(status_code=200, content=response)
|
313 |
+
except Exception as e:
|
314 |
+
raise HTTPException(status_code=400, detail=str(e))
|
315 |
+
|
316 |
+
@app.get("/get_chunks/", response_model=dict)
|
317 |
+
def get_chunks(request: DocumentChunks) -> dict:
|
318 |
+
"""
|
319 |
+
Retrieves text chunks from a specified range of pages in a document according to the chunking method.
|
320 |
+
|
321 |
+
This endpoint decodes a stored document and processes it to extract text chunks from specified pages.
|
322 |
+
Users must specify a valid start and end page, and a chunking method. The method can be 'chunk_per_page'
|
323 |
+
for straightforward chunking by page, or 'personalize_chunking' which may use additional text parameters.
|
324 |
+
|
325 |
+
Parameters:
|
326 |
+
- request (DocumentChunks): A model containing the necessary details to fetch and chunk the document.
|
327 |
+
|
328 |
+
Returns:
|
329 |
+
- A dictionary response with the status, a message including the count of chunks, and the chunks themselves.
|
330 |
+
|
331 |
+
Raises:
|
332 |
+
- HTTPException: 400 Bad Request if there are parameter errors or processing fails.
|
333 |
+
"""
|
334 |
+
try:
|
335 |
+
manager_doc = functions_doc()
|
336 |
+
response = manager_doc.get_document(token=request.token, service_name=request.service_name, document_name=request.document_name)
|
337 |
+
if response["status"] == "success":
|
338 |
+
decoded_file = base64.b64decode(response["document"])
|
339 |
+
pdf_file_like = io.BytesIO(decoded_file)
|
340 |
+
pdf_reader = PyPDF2.PdfReader(pdf_file_like)
|
341 |
+
number_of_pages = len(pdf_reader.pages)
|
342 |
+
|
343 |
+
if request.start_page < 1 or request.end_page > number_of_pages or request.start_page > request.end_page:
|
344 |
+
raise HTTPException(status_code=400, detail="Invalid start_page or end_page.")
|
345 |
+
|
346 |
+
if request.method == "chunk_per_page":
|
347 |
+
chunks = manager_doc.extract_text_from_pdf(pdf_file_like, request.start_page, request.end_page)
|
348 |
+
return {"status": "success", "message": f"Document has {len(chunks)} chunk(s).", "chunks": chunks}
|
349 |
+
elif request.method == "personalize_chunking":
|
350 |
+
# Assuming you process personalized chunking here:
|
351 |
+
personalized_chunks = manager_doc.personalize_chunking(request.split_token, pdf_file_like, request.start_page, request.end_page)
|
352 |
+
return {"status": "success", "message": f"Document has {len(personalized_chunks)} personalized chunk(s).", "chunks": personalized_chunks}
|
353 |
+
else:
|
354 |
+
raise HTTPException(status_code=400, detail="Invalid method provided.")
|
355 |
+
return response
|
356 |
+
except Exception as e:
|
357 |
+
raise HTTPException(status_code=400, detail=str(e))
|
358 |
+
|
359 |
+
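+# Example request body for /get_chunks/ (illustrative values):
+#   {"token": "demo-token", "service_name": "invoices",
+#    "document_name": "inv-001.pdf", "method": "personalize_chunking",
+#    "split_token": "ARTICLE", "start_page": 1, "end_page": 3}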
@app.get("/structure_response/", response_model=dict)
|
360 |
+
def structure_response(request: DocumentRespons)-> dict:
|
361 |
+
"""
|
362 |
+
Retrieves and processes chunks of a document into structured JSON based on specific criteria.
|
363 |
+
|
364 |
+
This endpoint decodes a stored document and processes it to extract and transform text chunks
|
365 |
+
from specified pages into structured JSON format. Users must specify a valid start and end page,
|
366 |
+
and a chunking method. The method can be 'chunk_per_page' for straightforward chunking by page,
|
367 |
+
or 'personalize_chunking' which may use additional text parameters. The model parameter can take
|
368 |
+
values like "gpt-3.5-turbo" or "gemini" for processing the chunks. The processing method and output
|
369 |
+
schema are specified by the user.
|
370 |
+
|
371 |
+
Parameters:
|
372 |
+
- request (DocumentRespons): A model containing the details needed to fetch, chunk, and structure the document.
|
373 |
+
|
374 |
+
Returns:
|
375 |
+
- A dictionary response with the status and the structured JSON if successful.
|
376 |
+
|
377 |
+
Raises:
|
378 |
+
- HTTPException: 400 Bad Request for parameter errors or processing issues.
|
379 |
+
- HTTPException: 500 Internal Server Error for any other unexpected errors.
|
380 |
+
"""
|
381 |
+
|
382 |
+
try:
|
383 |
+
# Assuming functions_doc() returns an instance with necessary methods
|
384 |
+
manager_doc = functions_doc()
|
385 |
+
response = manager_doc.get_document(token=request.token, service_name=request.service_name, document_name=request.document_name)
|
386 |
+
if response["status"] == "success":
|
387 |
+
decoded_file = base64.b64decode(response["document"])
|
388 |
+
pdf_file_like = io.BytesIO(decoded_file)
|
389 |
+
pdf_reader = PyPDF2.PdfReader(pdf_file_like)
|
390 |
+
number_of_pages = len(pdf_reader.pages)
|
391 |
+
|
392 |
+
if request.start_page < 1 or request.end_page > number_of_pages or request.start_page > request.end_page:
|
393 |
+
raise HTTPException(status_code=400, detail="Invalid start_page or end_page.")
|
394 |
+
|
395 |
+
if request.method == "chunk_per_page":
|
396 |
+
chunks = manager_doc.extract_text_from_pdf(pdf_file_like, request.start_page, request.end_page)
|
397 |
+
json_list = process_chunks(chunks, manager_doc, request.schema, request.model,request.comment)
|
398 |
+
return {"status": "success", "json": json_list}
|
399 |
+
elif request.method == "personalize_chunking":
|
400 |
+
personalized_chunks = manager_doc.personalize_chunking(request.split_token, pdf_file_like, request.start_page, request.end_page)
|
401 |
+
json_list = process_chunks(personalized_chunks, manager_doc, request.schema, request.model,request.comment)
|
402 |
+
return {"status": "success", "json": json_list}
|
403 |
+
else:
|
404 |
+
raise HTTPException(status_code=400, detail="Invalid method provided.")
|
405 |
+
return response
|
406 |
+
except Exception as e:
|
407 |
+
raise HTTPException(status_code=500, detail=str(e))
|
408 |
+
|
409 |
+
+def process_chunks(chunks, manager_doc, schema, model, comment):
+    """Run each text chunk through the model and collect the structured results."""
+    json_list = []
+    for chunk in chunks:
+        try:
+            response = manager_doc.get_json(schema, chunk, model, comment)
+        except Exception as e:
+            logging.error(f"Failed to structure chunk: {e}")
+            response = {}
+        json_list.append(response)
+    return json_list
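
A minimal end-to-end client sketch for the API above (assumptions: the stack runs locally on port 80 as in docker-compose.yml; the token, service, and file names are hypothetical; and because the read endpoints are declared as GET yet expect JSON bodies, the body is sent explicitly via requests.request):

    import base64
    import requests

    BASE = "http://localhost:80"  # assumed local docker-compose deployment
    TOKEN = "demo-token"          # hypothetical client token

    # Register a service, then upload a base64-encoded PDF
    requests.post(f"{BASE}/add_services",
                  json={"token": TOKEN, "services": [{"servicename": "invoices"}]})
    with open("inv-001.pdf", "rb") as f:  # hypothetical local file
        encoded = base64.b64encode(f.read()).decode()
    requests.post(f"{BASE}/add_and_store_document/",
                  json={"token": TOKEN, "service_name": "invoices",
                        "document_name": "inv-001.pdf", "file": encoded})

    # Ask for structured extraction over page 1
    resp = requests.request("GET", f"{BASE}/structure_response/",
                            json={"token": TOKEN, "service_name": "invoices",
                                  "document_name": "inv-001.pdf",
                                  "method": "chunk_per_page",
                                  "model": "gpt-3.5-turbo",
                                  "schema": {"invoice_number": "", "total": ""},
                                  "start_page": 1, "end_page": 1})
    print(resp.json())
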
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+fastapi
+uvicorn[standard]
+pydantic<2  # the models use v1-style validators
+requests
+redis
+numpy
+gevent
+PyPDF2
+pdfplumber
+openai
+google-generativeai==0.7.0
srs/utils.py
ADDED
@@ -0,0 +1,429 @@
+import json
+import logging
+import os
+import re
+
+import google.generativeai as genai
+import pdfplumber
+import redis
+from openai import OpenAI
+from typing import Dict, List, Optional
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+r = redis.Redis(host='redis', port=6379, db=0, password=None, decode_responses=True)
+class functions_doc:
+    def __init__(self):
+        # API keys are read from the environment; keys must never be committed to source control.
+        self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+        GENERATION_CONFIG = {
+            "temperature": 0.2,
+            "top_p": 0.75,
+            "max_output_tokens": 6000,
+        }
+        SAFETY_SETTINGS = [
+            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+        ]
+
+        # Set up the Gemini model
+        genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
+        self.model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest",
+                                           generation_config=GENERATION_CONFIG,
+                                           safety_settings=SAFETY_SETTINGS)
+
+    @staticmethod
+    def add_services_to_token(token: str, services: List[Dict]) -> Dict:
+        """
+        Add multiple services to a token's list of services, skipping any service
+        whose name already exists.
+
+        :param token: The unique token of the user.
+        :param services: List of service dictionaries to add.
+        :return: A dictionary with a message and the list of added services.
+        """
+        services_key = f"token:{token}:docservices"
+        try:
+            existing_services = r.lrange(services_key, 0, -1)
+            existing_service_names = [json.loads(service)['servicename'] for service in existing_services]
+        except Exception as e:
+            raise Exception("Failed to fetch or parse existing services: " + str(e))
+
+        if not services or not isinstance(services, list):
+            raise ValueError("Invalid services format. It must be a list of services.")
+
+        added_services = []
+        for service in services:
+            if not isinstance(service, dict) or 'servicename' not in service:
+                continue
+
+            servicename = service.get('servicename')
+            if servicename in existing_service_names:
+                continue
+
+            service_info = json.dumps({"servicename": servicename, "documents": []})
+            try:
+                r.rpush(services_key, service_info)
+                added_services.append(servicename)
+            except Exception as e:
+                raise Exception(f"Failed to add service {servicename}: " + str(e))
+
+        if not added_services:
+            raise Exception("No new services were added. They may already exist or input was invalid.")
+
+        return {"message": "Services successfully added.", "added_services": added_services}
+
+    def remove_service_by_name(self, token, servicename):
+        """
+        Remove a service entry from Redis based on the service name,
+        together with all of its associated documents.
+
+        Parameters:
+            token (str): Token identifying the user data.
+            servicename (str): Name of the service to remove.
+
+        Returns:
+            dict: Status and message of the operation.
+        """
+        try:
+            user_key = f"token:{token}:docservices"
+            # Queue the list edits in a Redis pipeline
+            pipe = r.pipeline()
+            list_length = r.llen(user_key)
+
+            for i in range(list_length):
+                # Read each JSON entry from the list
+                service_data = r.lindex(user_key, i)
+                if service_data:
+                    data = json.loads(service_data)
+                    if data["servicename"] == servicename:
+                        # Remove all documents stored for this service
+                        documents = data["documents"]
+                        document_names = [doc['documentname'] for doc in documents]
+                        logging.info(f"Removing documents: {document_names}")
+                        self.remove_documents_from_service(token, servicename, document_names)
+                        # Remove the service entry itself from the list
+                        pipe.lrem(user_key, 0, service_data)
+
+            pipe.execute()
+            return {"success": True, "message": f"Service {servicename} and all associated documents removed."}
+        except Exception as e:
+            return {"success": False, "message": str(e)}
+
+    @staticmethod
+    def add_and_store_document(token: str, service_name: str, document_name: str, encoded_file: bytes) -> dict:
+        """
+        Add a document to a service under a user's token and store its content in Redis.
+        If the document name already exists in the service, it is neither appended nor stored again.
+
+        :param token: The unique token of the user.
+        :param service_name: The service to which the document will be added.
+        :param document_name: The name of the document to add.
+        :param encoded_file: The base64-encoded file to store.
+        :return: A dictionary with a message indicating the result.
+        """
+        services_key = f"token:{token}:docservices"
+        binary_key_key = f"token:{token}:{service_name}:binarykey"
+
+        try:
+            existing_services = r.lrange(services_key, 0, -1)
+            for i, service in enumerate(existing_services):
+                service_data = json.loads(service)
+                if service_data['servicename'] == service_name:
+                    documents = service_data.get('documents', [])
+
+                    if any(doc['documentname'] == document_name for doc in documents):
+                        return {"message": "Document already exists in the service."}
+
+                    # Auto-increment the binary key used to address the stored file
+                    binary_key = r.incr(binary_key_key)
+
+                    # Append the new document entry and write the list item back
+                    documents.append({'documentname': document_name, 'binarykey': str(binary_key)})
+                    service_data['documents'] = documents
+                    r.lset(services_key, i, json.dumps(service_data))
+
+                    # Store the document content itself
+                    r.set(service_name + "_" + str(binary_key), encoded_file)
+                    logging.info("Document stored successfully in Redis.")
+                    return {"message": "Document successfully added and stored in the service."}
+
+            return {"message": "Service not found."}
+
+        except redis.RedisError as e:
+            logging.error(f"Failed to store document: {e}")
+            return {"status": "error", "message": str(e)}
+        except Exception as e:
+            logging.error(f"An error occurred: {e}")
+            return {"status": "error", "message": "An unexpected error occurred"}
+
+    def personalize_chunking(self, real_text, pdf_path, start_page, end_page):
+        """Extract text from the page range and split it on the user-supplied token."""
+        text = ""
+        with pdfplumber.open(pdf_path) as pdf:
+            # Only iterate over the requested page range (1-based, inclusive)
+            for page_number in range(start_page - 1, end_page):
+                page = pdf.pages[page_number]
+                # extract_text may return None for pages without extractable text
+                text += page.extract_text(x_tolerance=2, y_tolerance=4) or ""
+        return text.split(real_text)
+
+    def extract_text_from_pdf(self, pdf_path, start_page, end_page):
+        """Extract one text chunk per page over the requested page range."""
+        chunks = []
+        with pdfplumber.open(pdf_path) as pdf:
+            for page_number in range(start_page - 1, end_page):
+                page = pdf.pages[page_number]
+                text = page.extract_text(x_tolerance=2, y_tolerance=4)
+                chunks.append(text or "")
+        return chunks
+
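+    # Illustrative behavior: if pages 1-2 contain "ARTICLE 1 ... ARTICLE 2 ...",
+    # personalize_chunking("ARTICLE", pdf, 1, 2) returns ['', ' 1 ... ', ' 2 ...'];
+    # str.split keeps a leading empty chunk when the text starts with the token.
+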
+    def get_document(self, token: str, service_name: str, document_name: str) -> Optional[dict]:
+        """
+        Retrieve a stored PDF from Redis based on the token, service_name, and document_name.
+        Each document is stored under a key derived from these parameters.
+        """
+        try:
+            # Look up the binary key recorded for this document
+            binary_key = self.get_binary_key(token=token, service_name=service_name, document_name=document_name)
+
+            # Retrieve the document content from Redis
+            stored_file = r.get(service_name + "_" + str(binary_key))
+
+            if stored_file is None:
+                logging.info("No document found for the specified key.")
+                return {"status": "error", "message": "No document found for the specified key"}
+            else:
+                logging.info("Document retrieved successfully from Redis.")
+                return {"status": "success", "message": "Document retrieved successfully from Redis.", "document": stored_file}
+
+        except redis.RedisError as e:
+            logging.error(f"Failed to retrieve document: {e}")
+            return {"status": "error", "message": f"Failed to retrieve document: {e}"}
+        except Exception as e:
+            logging.error(f"An error occurred: {e}")
+            return None
+
+    def get_documents_from_service(self, token: str, servicename: str) -> dict:
+        """
+        Retrieve the document entries of a specific service within a token's list of services.
+
+        :param token: The unique token of the user.
+        :param servicename: The name of the service whose documents are retrieved.
+        :return: A dictionary with the list of documents, or a message indicating the result.
+        """
+        services_key = f"token:{token}:docservices"
+        try:
+            existing_services = r.lrange(services_key, 0, -1)
+            for service in existing_services:
+                service_data = json.loads(service)
+                if service_data['servicename'] == servicename:
+                    documents = service_data.get('documents', [])
+                    return {"success": True, "documents": documents}
+            return {"message": "Service not found."}
+        except Exception as e:
+            raise Exception("Failed to fetch or parse services: " + str(e))
+
+    def get_binary_key(self, token: str, service_name: str, document_name: str):
+        """Return the stored binary key for a document, or None if it is not registered."""
+        result = self.get_documents_from_service(token=token, servicename=service_name)
+        docs = result.get("documents", [])
+        for doc in docs:
+            if doc['documentname'] == document_name:
+                return doc['binarykey']
+        return None
+
+    def remove_documents_from_service(self, token: str, service_name: str, document_names: List[str]) -> dict:
+        """
+        Remove multiple PDF documents from Redis and their references from a specific
+        service within a token's list of services.
+        """
+        try:
+            services_key = f"token:{token}:docservices"
+            existing_services = r.lrange(services_key, 0, -1)
+            updated = False
+            for i, service in enumerate(existing_services):
+                service_data = json.loads(service)
+                if service_data['servicename'] == service_name:
+                    documents = service_data.get('documents', [])
+                    new_documents = [doc for doc in documents if doc['documentname'] not in document_names]
+
+                    if len(documents) != len(new_documents):
+                        # Delete the stored file of each removed document
+                        for document_name in document_names:
+                            binary_key = self.get_binary_key(token=token, service_name=service_name, document_name=document_name)
+                            redis_key = service_name + "_" + str(binary_key)
+                            if r.exists(redis_key):
+                                r.delete(redis_key)
+                                logging.info(f"Document with key {redis_key} removed successfully from Redis.")
+
+                        # Write the pruned document list back to the service entry
+                        service_data['documents'] = new_documents
+                        r.lset(services_key, i, json.dumps(service_data))
+                        updated = True
+
+            if updated:
+                return {"status": "success", "message": "Documents removed successfully from both Redis storage and service list."}
+            else:
+                return {"status": "error", "message": "No documents found in the service list or no changes were made."}
+
+        except redis.RedisError as e:
+            logging.error(f"Failed to delete documents: {e}")
+            return {"status": "error", "message": str(e)}
+        except Exception as e:
+            logging.error(f"An error occurred: {e}")
+            return {"status": "error", "message": "An unexpected error occurred"}
+
+    def get_json(self, schema, context, model, comment):
+        """Ask the selected model to fill the schema from the context and return parsed JSON."""
+        prompt = "Your task is to extract information from context."
+        var = ""
+        if comment:
+            var = f"""**Explanation of keys in schema**: {comment}"""
+        instruction = f"""
+        **JSON Format (High Priority)**: Provide the output in a properly formatted JSON structure.
+        **Respect Schema (High Priority)**: Utilize the schema below to organize the extracted information from the context. If certain information is absent, leave the corresponding field empty.
+        **Error Handling**: If the context does not contain sufficient information to fulfill the requirements, return the following JSON response: {{"message": "Context lacks the desired information"}}.
+
+        ```json
+        {{
+        {schema}
+        }}```
+        {var}
+        """
+        template = f"""
+        {prompt}
+        Consider the following:
+        {instruction}
+
+        CONTEXT:
+        {context}
+        """
+        if model == "gpt-3.5-turbo":
+            response = self.client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": template}])
+            pred_response = response.choices[0].message.content
+            return self.parse_json(pred_response)
+        elif model == "gemini":
+            response = self.model.generate_content(template)
+            pred_response = response.text
+            return self.parse_json(pred_response)
+        else:
+            raise ValueError(f"Unsupported model: {model}")
+
+    def clean_and_load_json(self, s):
+        """Best-effort cleanup of almost-JSON text before loading it."""
+        # Remove comments
+        s = re.sub(r'#.*?\n', '', s)
+        # Remove trailing commas before closing brackets in lists and dictionaries
+        s = re.sub(r',\s*\n\s*(\]|\})', r'\1', s)
+        # Remove ellipsis placeholders like '\n ...\n'
+        s = re.sub(r'\n\s*\.\.\.\n', '', s)
+        # Remove a comma directly before } or ]
+        s = re.sub(r',\s*(\]|\})', r'\1', s)
+        # Strip surrounding whitespace and load the cleaned JSON string
+        s = s.strip()
+        return json.loads(s)
+
+    def parse_json(self, s):
+        """Parse a model response as JSON, falling back to the outermost {...} block."""
+        try:
+            json_str = json.loads(s)
+        except json.JSONDecodeError:
+            # Locate the first '{' and the last '}' in the response
+            start_idx = s.find('{')
+            end_idx = s.rfind('}')
+            # If either delimiter is missing, give up
+            if start_idx == -1 or end_idx == -1:
+                raise ValueError("Could not find JSON object in the provided string.")
+            # Extract the JSON substring, including the closing brace
+            json_str = s[start_idx:end_idx + 1]
+            try:
+                json_str = json.loads(json_str)
+            except json.JSONDecodeError:
+                json_str = self.clean_and_load_json(json_str)
+        return json_str
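
A small self-contained illustration of the parse_json fallback path (the model output below is made up; __new__ skips __init__ so no API keys are required):

    from srs.utils import functions_doc

    messy = 'Sure! Here is the JSON:\n{\n  "invoice_number": "A-17",\n  "total": "99.50",\n}'
    fd = functions_doc.__new__(functions_doc)  # bypass __init__ (no API clients needed)
    print(fd.parse_json(messy))  # -> {'invoice_number': 'A-17', 'total': '99.50'}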