Nechba committed on
Commit
0b644a6
1 Parent(s): 808e59f

first commit

Files changed (4)
  1. docker-compose.yml +19 -0
  2. main.py +434 -0
  3. requirements.txt +11 -0
  4. srs/utils.py +389 -0
docker-compose.yml ADDED
@@ -0,0 +1,19 @@
+ version: '3.8'
+ services:
+   web:
+     build: .
+     ports:
+       - "80:80"
+     depends_on:
+       - redis
+     environment:
+       - REDIS_URL=redis://redis:6379
+   redis:
+     image: "redis/redis-stack-server:latest"
+     ports:
+       - "6379:6379"
+     volumes:
+       - redis_data:/data
+
+ volumes:
+   redis_data:
main.py ADDED
@@ -0,0 +1,434 @@
+ import asyncio
+ import base64
+ import io
+ import json
+ import logging
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional
+ from uuid import uuid4
+
+ import numpy as np
+ import PyPDF2
+ import redis
+ import requests
+ from fastapi import Depends, FastAPI, HTTPException, Query, WebSocket, WebSocketDisconnect, status
+ from fastapi.concurrency import run_in_threadpool
+ from fastapi.responses import JSONResponse
+ from pydantic import BaseModel, conlist, validator
+
+ from srs.utils import functions_doc
+
+ r = redis.Redis(host='redis', port=6379, db=0, password=None, decode_responses=True)
+
+ app = FastAPI()
+
+
+ class ServiceRemovalRequest(BaseModel):
+     token: str
+     servicename: str
+
+
+ class Document(BaseModel):
+     token: str
+     service_name: str
+     document_name: str
+
+
+ class ServicesResponse(BaseModel):
+     message: str
+     added_services: List[str]
+
+
+ class RemoveDocumentsRequest(BaseModel):
+     token: str
+     service_name: str
+     document_names: List[str]
+
+
+ # Request body models
+ class Service(BaseModel):
+     servicename: str
+
+
+ class TokenServicesRequest(BaseModel):
+     token: str
+     services: List[Service]
+
+
+ class Services(BaseModel):
+     token: str
+
+
+ class AddDocumentRequest(BaseModel):
+     token: str
+     servicename: str
+     documentname: str
+
+
+ class StoreDocumentServicesRequest(BaseModel):
+     token: str
+     service_name: str
+     document_name: str
+     file: bytes
+
+
+ class DocumentChunks(BaseModel):
+     token: str
+     service_name: str
+     document_name: str
+     method: str = "chunk_per_page"
+     split_token: Optional[str] = ""
+     start_page: int = 1
+     end_page: int = 1
+
+     @validator('split_token', always=True)
+     def check_split_token(cls, v, values):
+         method = values.get('method')
+         if method == 'personalize_chunking' and not v:
+             raise ValueError('split_token is required when method is personalize_chunking')
+         return v
+
+
+ class DocumentResponse(BaseModel):
+     token: str
+     service_name: str
+     document_name: str
+     method: str = "chunk_per_page"
+     model: str = "gpt-3.5-turbo"
+     schema: dict
+     comment: Optional[dict] = {}
+     split_token: Optional[str] = ""
+     start_page: int = 1
+     end_page: int = 1
+
+     @validator('split_token', always=True)
+     def check_split_token(cls, v, values):
+         method = values.get('method')
+         if method == 'personalize_chunking' and not v:
+             raise ValueError('split_token is required when method is personalize_chunking')
+         return v
+
+
+ @app.post("/add_services", status_code=status.HTTP_201_CREATED)
+ async def add_services(request: TokenServicesRequest):
+     """
+     Adds a list of services to a given token.
+
+     This endpoint accepts a request with a token and a list of services, attempting to add each service to the specified token.
+     The service information must include all necessary details, such as the service name.
+
+     Parameters:
+     - request (TokenServicesRequest): A model containing the authorization token and a list of services to be added.
+
+     Returns:
+     - A dictionary with a message and the list of successfully added services.
+
+     Raises:
+     - HTTPException: 400 Bad Request if any value error occurs during processing, typically due to invalid input.
+     - HTTPException: 500 Internal Server Error if any unexpected errors occur during the process.
+     """
+     try:
+         # Convert services to dicts
+         services_dicts = [service.dict() for service in request.services]
+         result = functions_doc.add_services_to_token(request.token, services_dicts)
+         return result
+     except ValueError as ve:
+         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
+     except Exception as e:
+         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+
+ @app.delete("/remove_service/")
+ async def remove_service(request: ServiceRemovalRequest):
+     """
+     Removes a specified service associated with a token.
+
+     This endpoint allows the removal of a service by its name from a list associated with a given token.
+     Before attempting to remove the service, it verifies that the token exists. If the token does not exist,
+     a 404 error is returned. If the service name does not exist under the token or cannot be removed,
+     a 400 error is raised with a specific message.
+
+     Parameters:
+     - request (ServiceRemovalRequest): A model containing the authorization token and the name of the service to be removed.
+
+     Returns:
+     - A dictionary with a success status and a message indicating the outcome of the operation.
+
+     Raises:
+     - HTTPException: 404 Not Found if the token does not exist.
+     - HTTPException: 400 Bad Request if the service cannot be removed or does not exist under the token.
+     - HTTPException: 500 Internal Server Error for any other unexpected errors.
+     """
+     try:
+         # Check if the token exists in Redis
+         user_key = f"token:{request.token}:docservices"
+         if not r.exists(user_key):
+             raise HTTPException(status_code=404, detail="Token not found.")
+         # If checks pass, proceed to remove the service
+         manager_doc = functions_doc()
+         result = manager_doc.remove_service_by_name(token=request.token, servicename=request.servicename)
+         if result["success"]:
+             return {"success": True, "message": result["message"]}
+         else:
+             raise HTTPException(status_code=400, detail=result["message"])
+     except HTTPException:
+         # Let deliberate HTTP errors (404/400) through instead of converting them to 500s
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/add_and_store_document/", summary="Store a Document in Redis",
+           description="Stores a document as a base64 encoded string in Redis. The document is tagged with additional metadata and associated with a unique key.")
+ async def add_and_store_document(request: StoreDocumentServicesRequest):
+     """
+     Stores a file document in Redis as a base64 encoded string with its corresponding tags and document names.
+
+     Args:
+         token (str): The unique identifier for the user.
+         service_name (str): The service under which the document will be stored.
+         document_name (str): The name of the document to be stored.
+         file (bytes): The base64 encoded file content to be stored.
+
+     Returns:
+         JSONResponse: A JSON response indicating the status of the operation ("success" or "error") and a message describing the result or the error encountered.
+
+     Raises:
+         HTTPException: An HTTP exception is raised with a status code of 400 or 500 depending on the type of error during the storing process.
+     """
+     try:
+         # Store the document
+         response = functions_doc.add_and_store_document(token=request.token, service_name=request.service_name, document_name=request.document_name, encoded_file=request.file)
+         return JSONResponse(status_code=200, content={"status": "success", "message": response})
+
+     except redis.RedisError as e:
+         logging.error(f"Failed to store document: {e}")
+         return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})
+
+     except Exception as e:
+         logging.error(f"An error occurred: {e}")
+         return JSONResponse(status_code=500, content={"status": "error", "message": "An unexpected error occurred"})
+
+
+ @app.delete("/remove_documents/", summary="Remove Multiple Documents",
+             description="Removes multiple documents from both the Redis store and the specified service list under a given token.")
+ async def remove_documents(request: RemoveDocumentsRequest):
+     """
+     Removes multiple documents from Redis storage and their references from a specified service list under a user's token.
+
+     Args:
+         request (RemoveDocumentsRequest): A Pydantic model that includes the token, the service name, and a list of document names to be removed.
+
+     Returns:
+         dict: A dictionary indicating the status of the operation ("success" or "error") and a message describing the result or error encountered.
+
+     Raises:
+         HTTPException: An HTTP exception is raised with status code 400 or 500, depending on the type of error during the document removal process.
+     """
+     try:
+         manager_doc = functions_doc()
+         response = manager_doc.remove_documents_from_service(token=request.token, service_name=request.service_name, document_names=request.document_names)
+         return response
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=str(e))
+
+
+ @app.get("/services/", response_model=list)
+ def retrieve_service_names(request: Services) -> list:
+     """
+     Endpoint to retrieve service names for a given token.
+
+     :param request: A model carrying the unique token of the user.
+     :return: A list of service names associated with the token, or an empty list if none are found.
+     """
+     services_key = f"token:{request.token}:docservices"
+     try:
+         existing_services = r.lrange(services_key, 0, -1)
+         service_names = [json.loads(service)['servicename'] for service in existing_services]
+         return service_names if service_names else []
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Failed to fetch or parse services: {str(e)}")
+
+
+ @app.get("/documents/", response_model=dict)
+ def retrieve_documents(request: ServiceRemovalRequest) -> dict:
+     """
+     Endpoint to retrieve document names from a specific service for a given token.
+
+     :param request: A model carrying the unique token of the user and the name of the service from which to retrieve documents.
+     :return: A dictionary containing a list of documents, or a message if the service is not found.
+     """
+     try:
+         manager_doc = functions_doc()
+         response = manager_doc.get_documents_from_service(token=request.token, servicename=request.servicename)
+         return response
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=str(e))
+
+
+ @app.get("/get_document/", response_model=dict)
+ def get_document(request: Document) -> dict:
+     """
+     Retrieves a document stored as base64-encoded bytes from a specified service for a given token.
+
+     This endpoint is responsible for fetching a document by name from a specific service associated with a token.
+     The document is expected to be stored in base64-encoded byte format. The response will include the document
+     if available or return an appropriate error message if not found or in case of an error.
+
+     Parameters:
+     - request (Document): A model containing the authorization token, service name, and document name.
+
+     Returns:
+     - A JSON response containing the document in base64-encoded format or an error message.
+
+     Raises:
+     - HTTPException: 400 Bad Request if there's an error during the retrieval process.
+     """
+     try:
+         manager_doc = functions_doc()
+         response = manager_doc.get_document(token=request.token, service_name=request.service_name, document_name=request.document_name)
+         return JSONResponse(status_code=200, content=response)
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=str(e))
+
+
+ @app.get("/get_num_pages/", response_model=dict)
+ def get_num_pages(request: Document) -> dict:
+     """
+     Retrieves the number of pages in a PDF document from a specified service for a given token.
+
+     This endpoint fetches a document stored as a base64-encoded string, decodes it, and counts the number of pages using PyPDF2.
+
+     Parameters:
+     - request (Document): A model containing the authorization token, service name, and document name.
+
+     Returns:
+     - A JSON response with the status, a message including the number of pages, and the number of pages if successful.
+
+     Raises:
+     - HTTPException: 400 Bad Request if there's an error during the retrieval or processing of the document.
+     """
+     try:
+         manager_doc = functions_doc()
+         response = manager_doc.get_document(token=request.token, service_name=request.service_name, document_name=request.document_name)
+         if response["status"] == "success":
+             decoded_file = base64.b64decode(response["document"])
+
+             # Use BytesIO to create a file-like object in memory from the decoded data
+             pdf_file_like = io.BytesIO(decoded_file)
+
+             # Use PyPDF2 to read the file-like object and count the pages
+             pdf_reader = PyPDF2.PdfReader(pdf_file_like)
+             number_of_pages = len(pdf_reader.pages)
+             return JSONResponse(status_code=200, content={"status": "success", "message": f"Document has {number_of_pages} pages.", "num_pages": number_of_pages})
+         return JSONResponse(status_code=200, content=response)
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=str(e))
+
+
+ @app.get("/get_chunks/", response_model=dict)
+ def get_chunks(request: DocumentChunks) -> dict:
+     """
+     Retrieves text chunks from a specified range of pages in a document according to the chunking method.
+
+     This endpoint decodes a stored document and processes it to extract text chunks from specified pages.
+     Users must specify a valid start and end page, and a chunking method. The method can be 'chunk_per_page'
+     for straightforward chunking by page, or 'personalize_chunking', which splits the text on a user-supplied token.
+
+     Parameters:
+     - request (DocumentChunks): A model containing the necessary details to fetch and chunk the document.
+
+     Returns:
+     - A dictionary response with the status, a message including the count of chunks, and the chunks themselves.
+
+     Raises:
+     - HTTPException: 400 Bad Request if there are parameter errors or processing fails.
+     """
+     try:
+         manager_doc = functions_doc()
+         response = manager_doc.get_document(token=request.token, service_name=request.service_name, document_name=request.document_name)
+         if response["status"] == "success":
+             decoded_file = base64.b64decode(response["document"])
+             pdf_file_like = io.BytesIO(decoded_file)
+             pdf_reader = PyPDF2.PdfReader(pdf_file_like)
+             number_of_pages = len(pdf_reader.pages)
+
+             if request.start_page < 1 or request.end_page > number_of_pages or request.start_page > request.end_page:
+                 raise HTTPException(status_code=400, detail="Invalid start_page or end_page.")
+
+             if request.method == "chunk_per_page":
+                 chunks = manager_doc.extract_text_from_pdf(pdf_file_like, request.start_page, request.end_page)
+                 return {"status": "success", "message": f"Document has {len(chunks)} chunk(s).", "chunks": chunks}
+             elif request.method == "personalize_chunking":
+                 personalized_chunks = manager_doc.personalize_chunking(request.split_token, pdf_file_like, request.start_page, request.end_page)
+                 return {"status": "success", "message": f"Document has {len(personalized_chunks)} personalized chunk(s).", "chunks": personalized_chunks}
+             else:
+                 raise HTTPException(status_code=400, detail="Invalid method provided.")
+         return response
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=str(e))
+
+
+ @app.get("/structure_response/", response_model=dict)
+ def structure_response(request: DocumentResponse) -> dict:
+     """
+     Retrieves and processes chunks of a document into structured JSON based on specific criteria.
+
+     This endpoint decodes a stored document and processes it to extract and transform text chunks
+     from specified pages into structured JSON format. Users must specify a valid start and end page,
+     and a chunking method. The method can be 'chunk_per_page' for straightforward chunking by page,
+     or 'personalize_chunking', which splits the text on a user-supplied token. The model parameter can take
+     values like "gpt-3.5-turbo" or "gemini" for processing the chunks. The processing method and output
+     schema are specified by the user.
+
+     Parameters:
+     - request (DocumentResponse): A model containing the details needed to fetch, chunk, and structure the document.
+
+     Returns:
+     - A dictionary response with the status and the structured JSON if successful.
+
+     Raises:
+     - HTTPException: 400 Bad Request for parameter errors or processing issues.
+     - HTTPException: 500 Internal Server Error for any other unexpected errors.
+     """
+     try:
+         manager_doc = functions_doc()
+         response = manager_doc.get_document(token=request.token, service_name=request.service_name, document_name=request.document_name)
+         if response["status"] == "success":
+             decoded_file = base64.b64decode(response["document"])
+             pdf_file_like = io.BytesIO(decoded_file)
+             pdf_reader = PyPDF2.PdfReader(pdf_file_like)
+             number_of_pages = len(pdf_reader.pages)
+
+             if request.start_page < 1 or request.end_page > number_of_pages or request.start_page > request.end_page:
+                 raise HTTPException(status_code=400, detail="Invalid start_page or end_page.")
+
+             if request.method == "chunk_per_page":
+                 chunks = manager_doc.extract_text_from_pdf(pdf_file_like, request.start_page, request.end_page)
+                 json_list = process_chunks(chunks, manager_doc, request.schema, request.model, request.comment)
+                 return {"status": "success", "json": json_list}
+             elif request.method == "personalize_chunking":
+                 personalized_chunks = manager_doc.personalize_chunking(request.split_token, pdf_file_like, request.start_page, request.end_page)
+                 json_list = process_chunks(personalized_chunks, manager_doc, request.schema, request.model, request.comment)
+                 return {"status": "success", "json": json_list}
+             else:
+                 raise HTTPException(status_code=400, detail="Invalid method provided.")
+         return response
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ def process_chunks(chunks, manager_doc, schema, model, comment):
+     json_list = []
+     for chunk in chunks:
+         try:
+             response = manager_doc.get_json(schema, chunk, model, comment)
+         except Exception:
+             # Fall back to an empty dict for chunks that fail to parse
+             response = {}
+         json_list.append(response)
+     return json_list
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ fastapi
+ uvicorn[standard]
+ pydantic
+ requests
+ redis
+ numpy
+ gevent
+ PyPDF2
+ pdfplumber
+ openai
+ google-generativeai==0.7.0
srs/utils.py ADDED
@@ -0,0 +1,389 @@
+ import json
+ import logging
+ import os
+ import re
+ from typing import Any, Dict, List, Optional
+
+ import google.generativeai as genai
+ import numpy as np
+ import pdfplumber
+ import redis
+ import redis.commands.search
+ from openai import OpenAI
+ from redis.commands.search.field import TagField, TextField, VectorField
+ from redis.commands.search.indexDefinition import IndexDefinition, IndexType
+ from redis.commands.search.query import Query
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger()
+ r = redis.Redis(host='redis', port=6379, db=0, password=None, decode_responses=True)
+
+
+ class functions_doc:
+     def __init__(self):
+         # API keys are read from the environment instead of being hardcoded
+         self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+         GENERATION_CONFIG = {
+             "temperature": 0.2,
+             "top_p": 0.75,
+             "max_output_tokens": 6000,
+         }
+         SAFETY_SETTINGS = [
+             {
+                 "category": "HARM_CATEGORY_HARASSMENT",
+                 "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+             },
+             {
+                 "category": "HARM_CATEGORY_HATE_SPEECH",
+                 "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+             },
+             {
+                 "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+                 "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+             },
+             {
+                 "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
+                 "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+             },
+         ]
+
+         # Set up the Gemini model
+         genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+         self.model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest",
+                                            generation_config=GENERATION_CONFIG,
+                                            safety_settings=SAFETY_SETTINGS)
+
+     @staticmethod
+     def add_services_to_token(token: str, services: List[Dict]) -> Dict:
+         """
+         Add multiple services to a token's list of services, checking if a service with the same name already exists.
+         Each service is a dictionary with 'servicename' and 'modelname' (which may be default or empty).
+
+         :param token: The unique token of the user.
+         :param services: List of service dictionaries to add.
+         :return: A dictionary with a message and the list of added services.
+         """
+         services_key = f"token:{token}:docservices"
+         try:
+             existing_services = r.lrange(services_key, 0, -1)
+             existing_service_names = [json.loads(service)['servicename'] for service in existing_services]
+         except Exception as e:
+             raise Exception("Failed to fetch or parse existing services: " + str(e))
+
+         if not services or not isinstance(services, list):
+             raise ValueError("Invalid services format. It must be a list of services.")
+
+         added_services = []
+         for service in services:
+             if not isinstance(service, dict) or 'servicename' not in service:
+                 continue
+
+             servicename = service.get('servicename')
+
+             if servicename in existing_service_names:
+                 continue
+
+             service_info = json.dumps({"servicename": servicename, "documents": []})
+             try:
+                 r.rpush(services_key, service_info)
+                 added_services.append(servicename)
+             except Exception as e:
+                 raise Exception(f"Failed to add service {servicename}: " + str(e))
+
+         if not added_services:
+             raise Exception("No new services were added. They may already exist or input was invalid.")
+
+         return {"message": "Services successfully added.", "added_services": added_services}
+
+     def remove_service_by_name(self, token, servicename):
+         """
+         Remove a service entry from Redis based on the servicename,
+         and also remove all associated documents.
+
+         Parameters:
+             token (str): Token to identify the user data.
+             servicename (str): Name of the service to be removed.
+
+         Returns:
+             dict: Status and message of the operation.
+         """
+         try:
+             # Define the user key
+             user_key = f"token:{token}:docservices"
+             # Start a Redis pipeline
+             pipe = r.pipeline()
+
+             # Retrieve the length of the list
+             list_length = r.llen(user_key)
+
+             for i in range(list_length):
+                 # Retrieve each JSON string from the list
+                 service_data = r.lindex(user_key, i)
+                 if service_data:
+                     # Convert JSON string to dictionary
+                     data = json.loads(service_data)
+                     if data["servicename"] == servicename:
+                         # Remove all associated documents
+                         documents = data["documents"]
+                         document_names = [doc['documentname'] for doc in documents]
+                         logger.info(f"Removing documents: {document_names}")
+                         self.remove_documents_from_service(token, servicename, document_names)
+                         # Remove the JSON string from the list
+                         pipe.lrem(user_key, 0, service_data)
+
+             # Execute the pipeline
+             pipe.execute()
+             return {"success": True, "message": f"Service {servicename} and all associated documents removed."}
+         except Exception as e:
+             return {"success": False, "message": str(e)}
+
+     @staticmethod
+     def add_and_store_document(token: str, service_name: str, document_name: str, encoded_file: bytes) -> dict:
+         """
+         Adds a document to a specific service within a user's token and immediately stores the document in Redis.
+         If the document name already exists in the service, it is not appended or stored again.
+
+         :param token: The unique token of the user.
+         :param service_name: The name of the service to which the document will be added.
+         :param document_name: The name of the document to add.
+         :param encoded_file: The base64 encoded file to be stored.
+         :return: A dictionary with a message indicating the result.
+         """
+         services_key = f"token:{token}:docservices"
+         binary_key_key = f"token:{token}:{service_name}:binarykey"
+
+         try:
+             existing_services = r.lrange(services_key, 0, -1)
+             service_found = False
+             for i, service in enumerate(existing_services):
+                 service_data = json.loads(service)
+                 if service_data['servicename'] == service_name:
+                     service_found = True
+                     documents = service_data.get('documents', [])
+
+                     if any(doc['documentname'] == document_name for doc in documents):
+                         return {"message": "Document already exists in the service."}
+
+                     # Auto-increment binary key
+                     binary_key = r.incr(binary_key_key)
+
+                     # Append new document info dictionary
+                     documents.append({'documentname': document_name, 'binarykey': str(binary_key)})
+                     service_data['documents'] = documents
+                     updated_service = json.dumps(service_data)
+                     r.lset(services_key, i, updated_service)
+
+                     # Store the document in Redis
+                     r.set(service_name + "_" + str(binary_key), encoded_file)
+                     logging.info("Document stored successfully in Redis.")
+                     return {"message": "Document successfully added and stored in the service."}
+
+             if not service_found:
+                 return {"message": "Service not found."}
+
+         except redis.RedisError as e:
+             logging.error(f"Failed to store document: {e}")
+             return {"status": "error", "message": str(e)}
+
+         except Exception as e:
+             logging.error(f"An error occurred: {e}")
+             return {"status": "error", "message": "An unexpected error occurred"}
+
+     def personalize_chunking(self, real_text, pdf_path, start_page, end_page):
+         text = ""
+         with pdfplumber.open(pdf_path) as pdf:
+             # Only iterate over the desired page range
+             for page_number in range(start_page - 1, end_page):
+                 page = pdf.pages[page_number]
+                 # Extract text from the page with specific tolerances
+                 text += page.extract_text(x_tolerance=2, y_tolerance=4) or ""
+         return text.split(real_text)
+
+     def extract_text_from_pdf(self, pdf_path, start_page, end_page):
+         chunks = []
+         with pdfplumber.open(pdf_path) as pdf:
+             # Only iterate over the desired page range
+             for page_number in range(start_page - 1, end_page):
+                 page = pdf.pages[page_number]
+                 # Extract text from the page with specific tolerances
+                 text = page.extract_text(x_tolerance=2, y_tolerance=4)
+                 chunks.append(text)
+         return chunks
+
+     def get_document(self, token: str, service_name: str, document_name: str) -> Optional[dict]:
+         """
+         Retrieve a stored PDF file from Redis based on the token, service_name, and document_name.
+         Each document is assumed to be stored with a unique key constructed from these parameters.
+         """
+         try:
+             # Generate a binary key based on inputs
+             binary_key = self.get_binary_key(token=token, service_name=service_name, document_name=document_name)
+
+             # Retrieve the document from Redis
+             stored_file = r.get(service_name + "_" + str(binary_key))
+
+             if stored_file is None:
+                 # Log and handle the case where no file is found
+                 logging.info("No document found for the specified key.")
+                 return {"status": "error", "message": "No document found for the specified key"}
+             else:
+                 # Log success
+                 logging.info("Document retrieved successfully from Redis.")
+                 return {"status": "success", "message": "Document retrieved successfully from Redis.", "document": stored_file}
+
+         except redis.RedisError as e:
+             # Log the Redis error
+             logging.error(f"Failed to retrieve document: {e}")
+             return {"status": "error", "message": f"Failed to retrieve document: {e}"}
+
+         except Exception as e:
+             # Handle other possible exceptions
+             logging.error(f"An error occurred: {e}")
+             return None
+
+     def get_documents_from_service(self, token: str, servicename: str) -> dict:
+         """
+         Retrieve document names from a specific service within a specific token's list of services.
+
+         :param token: The unique token of the user.
+         :param servicename: The name of the service from which documents will be retrieved.
+         :return: A dictionary with a list of documents or a message indicating the result.
+         """
+         services_key = f"token:{token}:docservices"
+         try:
+             existing_services = r.lrange(services_key, 0, -1)
+             for service in existing_services:
+                 service_data = json.loads(service)
+                 if service_data['servicename'] == servicename:
+                     documents = service_data.get('documents', [])
+                     return {"success": True, "documents": documents}
+
+             return {"message": "Service not found."}
+         except Exception as e:
+             raise Exception("Failed to fetch or parse services: " + str(e))
+
+     def get_binary_key(self, token: str, service_name: str, document_name: str):
+         result = self.get_documents_from_service(token=token, servicename=service_name)
+         docs = result.get("documents", [])
+         for doc in docs:
+             if doc['documentname'] == document_name:
+                 return doc['binarykey']
+         return None
+
+     def remove_documents_from_service(self, token: str, service_name: str, document_names: List[str]) -> dict:
+         """
+         Removes multiple PDF documents from Redis and their references from a specific service within a specific token's list of services.
+         """
+         try:
+             services_key = f"token:{token}:docservices"
+             existing_services = r.lrange(services_key, 0, -1)
+             updated = False
+             for i, service in enumerate(existing_services):
+                 service_data = json.loads(service)
+                 if service_data['servicename'] == service_name:
+                     documents = service_data.get('documents', [])
+                     new_documents = [doc for doc in documents if doc['documentname'] not in document_names]
+
+                     if len(documents) != len(new_documents):
+                         # Remove documents from direct Redis storage
+                         for document_name in document_names:
+                             binary_key = self.get_binary_key(token=token, service_name=service_name, document_name=document_name)
+                             redis_key = service_name + "_" + str(binary_key)
+                             logger.info(f"Removing Redis key {redis_key}")
+                             if r.exists(redis_key):
+                                 r.delete(redis_key)
+                                 logging.info(f"Document with key {redis_key} removed successfully from Redis.")
+
+                         # Update the service data if any documents are removed
+                         service_data['documents'] = new_documents
+                         updated_service = json.dumps(service_data)
+                         r.lset(services_key, i, updated_service)
+                         updated = True
+
+             if updated:
+                 return {"status": "success", "message": "Documents removed successfully from both Redis storage and service list."}
+             else:
+                 return {"status": "error", "message": "No documents found in the service list or no changes were made."}
+
+         except redis.RedisError as e:
+             logging.error(f"Failed to delete documents: {e}")
+             return {"status": "error", "message": str(e)}
+
+         except Exception as e:
+             logging.error(f"An error occurred: {e}")
+             return {"status": "error", "message": "An unexpected error occurred"}
+
+     def get_json(self, schema, context, model, comment):
+         prompt = "Your task is to extract information from context."
+         var = ""
+         if comment:
+             var = f"""**Explanation of keys in schema**: {comment}"""
+         instruction = f"""
+         **JSON Format (High Priority)**: Provide the output in a properly formatted JSON structure.
+         **Respect Schema (High Priority)**: Utilize the schema below to organize the extracted information from the context. If certain information is absent, leave the corresponding field empty.
+         **Error Handling**: If the context does not contain sufficient information to fulfill the requirements, return the following JSON response: {{"message": "Context lacks the desired information"}}.
+
+         ```json
+         {{
+         {schema}
+         }}```
+         {var}
+         """
+         template = f"""
+         {prompt}
+         Consider the following:
+         {instruction}
+
+         CONTEXT:
+         {context}
+         """
+         if model == "gpt-3.5-turbo":
+             response = self.client.chat.completions.create(
+                 model="gpt-3.5-turbo",
+                 messages=[{"role": "user", "content": template}]
+             )
+             pred_response = response.choices[0].message.content
+             return self.parse_json(pred_response)
+         elif model == "gemini":
+             response = self.model.generate_content(template)
+             pred_response = response.text
+             return self.parse_json(pred_response)
+         else:
+             raise ValueError(f"Unsupported model: {model}")
+
+     def clean_and_load_json(self, s):
+         # Remove comments
+         s = re.sub(r'#.*?\n', '', s)
+
+         # Remove trailing commas before closing brackets in lists and dictionaries
+         s = re.sub(r',\s*\n\s*(\]|\})', r'\1', s)
+         # Remove patterns like '\n ...\n'
+         s = re.sub(r'\n\s*\.\.\.\n', '', s)
+         # Remove comma before } or ]
+         s = re.sub(r',\s*(\]|\})', r'\1', s)
+         # Remove unnecessary whitespace
+         s = s.strip()
+         # Load the cleaned JSON string
+         return json.loads(s)
+
+     def parse_json(self, s):
+         try:
+             json_str = json.loads(s)
+         except Exception:
+             # Find the index of the first occurrence of '{'
+             start_idx = s.find('{')
+
+             # Find the index of the last occurrence of '}'
+             end_idx = s.rfind('}')
+
+             # If either index is not found, raise an error
+             if start_idx == -1 or end_idx == -1:
+                 raise ValueError("Could not find JSON object in the provided string.")
+
+             # Extract the JSON substring from start_idx to end_idx (inclusive)
+             json_str = s[start_idx:end_idx + 1]
+             try:
+                 json_str = json.loads(json_str)
+             except Exception:
+                 json_str = self.clean_and_load_json(json_str)
+         return json_str
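For context, here is a minimal client sketch of how the new endpoints fit together: register a service under a token, upload a base64-encoded PDF, then request page chunks. It assumes the compose stack from this commit is running locally with the web service on port 80; the token, service name, and file name are made-up examples, and the chunking endpoints are called as GETs with JSON bodies, matching how the routes are defined.

import base64

import requests

BASE_URL = "http://localhost:80"
TOKEN = "demo-token"  # hypothetical token

# 1. Register a service under the token
resp = requests.post(f"{BASE_URL}/add_services",
                     json={"token": TOKEN, "services": [{"servicename": "invoices"}]})
print(resp.json())

# 2. Upload a PDF as a base64-encoded string
with open("demo.pdf", "rb") as f:
    encoded = base64.b64encode(f.read()).decode()
resp = requests.post(f"{BASE_URL}/add_and_store_document/",
                     json={"token": TOKEN, "service_name": "invoices",
                           "document_name": "demo.pdf", "file": encoded})
print(resp.json())

# 3. Chunk the first page (these routes are GETs that expect a JSON body)
resp = requests.get(f"{BASE_URL}/get_chunks/",
                    json={"token": TOKEN, "service_name": "invoices",
                          "document_name": "demo.pdf", "method": "chunk_per_page",
                          "start_page": 1, "end_page": 1})
print(resp.json())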