Ilyas KHIAT committed on
Commit fe370a3 · 1 Parent(s): 98cbbb6

api first commit by me :)

Files changed (5)
  1. .gitignore +2 -0
  2. Dockerfile +16 -0
  3. main.py +160 -0
  4. rag.py +108 -0
  5. requirements.txt +16 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ .env
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.12
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,160 @@
+ from fastapi import FastAPI, HTTPException, UploadFile
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel, Json
+ from uuid import uuid4
+ from typing import Optional
+ import os
+ import time
+ import asyncio
+ import async_timeout
+ import pymupdf
+ from pinecone import Pinecone, ServerlessSpec
+ from dotenv import load_dotenv
+ from rag import (
+     get_text_chunks,
+     get_vectorstore,
+     get_retrieve_answer,
+     generate_response_via_langchain,
+ )
+
+ load_dotenv()
+
+ pinecone_api_key = os.environ.get("PINECONE_API_KEY")
+
+ pc = Pinecone(api_key=pinecone_api_key)
+
+ index_name = os.environ.get("INDEX_NAME")  # change if desired
+
+ existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
+
+ if index_name not in existing_indexes:
+     pc.create_index(
+         name=index_name,
+         dimension=3072,  # matches OpenAI text-embedding-3-large
+         metric="cosine",
+         spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+     )
+     while not pc.describe_index(index_name).status["ready"]:
+         time.sleep(1)
+
+ index = pc.Index(index_name)
+
+ app = FastAPI()
+
+
+ class UserInput(BaseModel):
+     prompt: str
+     enterprise_id: str
+     stream: Optional[bool] = False
+     messages: Optional[list[dict]] = []
+
+
+ class EnterpriseData(BaseModel):
+     name: str
+     id: Optional[str] = None
+
+
+ @app.get("/")
+ def greet_json():
+     return {"Hello": "World!"}
+
+
+ @app.post("/upload")
+ async def upload_file(file: UploadFile, enterprise_data: Json[EnterpriseData]):
+     try:
+         # Read the uploaded file
+         contents = await file.read()
+
+         # Sanitize the enterprise name for use in namespace IDs
+         enterprise_name = enterprise_data.name.replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()
+
+         # Assign a new UUID if id is not provided
+         if enterprise_data.id is None:
+             enterprise_data.id = f"{enterprise_name}_{uuid4()}"
+
+         # Open the file with PyMuPDF
+         pdf_document = pymupdf.open(stream=contents, filetype="pdf")
+
+         # Extract all text from the document
+         text = ""
+         for page in pdf_document:
+             text += page.get_text()
+
+         # Split the text into chunks
+         text_chunks = get_text_chunks(text)
+
+         # Create a vector store
+         vector_store = get_vectorstore(text_chunks, filename=file.filename, file_type="pdf", namespace=enterprise_data.id, index=index)
+
+         if vector_store:
+             return {
+                 "file_name": file.filename,
+                 "enterprise_id": enterprise_data.id,
+                 "number_of_chunks": len(text_chunks),
+             }
+         else:
+             raise HTTPException(status_code=500, detail="Could not create vector store")
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+
+     finally:
+         await file.close()
+
+
+ @app.get("/documents/{enterprise_id}")
+ def get_documents(enterprise_id: str):
+     try:
+         # Vector IDs are "<filename>_<chunk index>"; strip the index to recover document names
+         docs_names = []
+         for ids in index.list(namespace=enterprise_id):
+             for id in ids:
+                 name_doc = "_".join(id.split("_")[:-1])
+                 if name_doc not in docs_names:
+                     docs_names.append(name_doc)
+         return docs_names
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+
+
+ @app.delete("/documents/all/{enterprise_id}")
+ def delete_all_documents(enterprise_id: str):
+     try:
+         index.delete(namespace=enterprise_id, delete_all=True)
+         return {"message": "All documents deleted"}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+
+
+ GENERATION_TIMEOUT_SEC = 60
+
+
+ async def stream_generator(response):
+     async with async_timeout.timeout(GENERATION_TIMEOUT_SEC):
+         try:
+             async for chunk in response:
+                 yield {"content": chunk}
+         except asyncio.TimeoutError:
+             raise HTTPException(status_code=504, detail="Stream timed out")
+
+
+ @app.post("/generate-answer/")
+ def generate_answer(user_input: UserInput):
+     try:
+         prompt = user_input.prompt
+         enterprise_id = user_input.enterprise_id
+
+         context = get_retrieve_answer(enterprise_id, prompt, index)
+         if not context:
+             context = "No context found"
+
+         answer = generate_response_via_langchain(prompt, model="gpt-4o", stream=user_input.stream, context=context, messages=user_input.messages)
+
+         if user_input.stream:
+             return StreamingResponse(answer, media_type="application/json")
+
+         return {
+             "prompt": prompt,
+             "answer": answer,
+             "context": context,
+         }
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
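
For reference, a minimal client sketch (not part of the commit) showing how these endpoints could be exercised once the server is running. The base URL, file name, and enterprise name are illustrative assumptions; enterprise_data appears to be read as a JSON-encoded query string here (pydantic's Json type), so if the deployed FastAPI version expects it as form data instead, move it into the multipart body.

import json
import requests  # third-party HTTP client, assumed installed

BASE_URL = "http://localhost:7860"  # hypothetical local deployment

# 1. Upload a PDF ("report.pdf" is a placeholder file)
with open("report.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/upload",
        params={"enterprise_data": json.dumps({"name": "Acme Corp"})},
        files={"file": ("report.pdf", f, "application/pdf")},
    )
resp.raise_for_status()
enterprise_id = resp.json()["enterprise_id"]

# 2. Non-streaming question answering
answer = requests.post(
    f"{BASE_URL}/generate-answer/",
    json={"prompt": "Summarize the document.", "enterprise_id": enterprise_id},
)
print(answer.json()["answer"])

# 3. Streaming variant: the endpoint yields raw text chunks
with requests.post(
    f"{BASE_URL}/generate-answer/",
    json={"prompt": "Summarize the document.", "enterprise_id": enterprise_id, "stream": True},
    stream=True,
) as streamed:
    for chunk in streamed.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)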
rag.py ADDED
@@ -0,0 +1,108 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+ from langchain_pinecone import PineconeVectorStore
+ from langchain_core.documents import Document
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import PromptTemplate
+
+
+ def get_text_from_content_for_doc(content):
+     text = ""
+     for page in content:
+         text += content[page]["texte"]
+     return text
+
+
+ def get_text_from_content_for_audio(content):
+     return content["transcription"]
+
+
+ def get_text_chunks(text):
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,  # the character length of each chunk
+         chunk_overlap=100,  # the character length of the overlap between chunks
+         length_function=len,  # measure length in characters (the built-in len())
+     )
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+
+ def get_vectorstore(text_chunks, filename, file_type, namespace, index):
+     try:
+         embedding = OpenAIEmbeddings(model="text-embedding-3-large")
+         vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
+
+         # Sanitize the filename for use in vector IDs
+         file_name = filename.split(".")[0].replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()
+
+         documents = []
+         uuids = []
+
+         for i, chunk in enumerate(text_chunks):
+             document = Document(
+                 page_content=chunk,
+                 metadata={"filename": filename, "file_type": file_type},
+             )
+             uuid = f"{file_name}_{i}"
+             uuids.append(uuid)
+             documents.append(document)
+
+         vector_store.add_documents(documents=documents, ids=uuids)
+
+         return True
+
+     except Exception:
+         return False
+
+
+ def get_retrieve_answer(enterprise_id, prompt, index):
+     try:
+         embedding = OpenAIEmbeddings(model="text-embedding-3-large")
+         vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=enterprise_id)
+
+         retriever = vector_store.as_retriever(
+             search_type="similarity_score_threshold",
+             search_kwargs={"k": 3, "score_threshold": 0.6},
+         )
+         response = retriever.invoke(prompt)
+
+         return response
+
+     except Exception:
+         return False
+
+
+ def generate_response_via_langchain(query: str, stream: bool = False, model: str = "gpt-4o-mini", context: str = "", messages=[]):
+     # Define the prompt template (French: "Given the following context: ...,
+     # and the conversation history: ..., <query>")
+     template = "Sachant le context suivant: {context}, et l'historique de la conversation: {messages}, {query}"
+     prompt = PromptTemplate.from_template(template)
+
+     # Initialize the OpenAI LLM with the specified model
+     llm = ChatOpenAI(model=model)
+
+     # Create an LLM chain with the prompt and the LLM
+     llm_chain = prompt | llm | StrOutputParser()
+
+     if stream:
+         # Return a generator that yields streamed responses
+         return llm_chain.astream({"query": query, "context": context, "messages": messages})
+
+     # Invoke the LLM chain and return the result
+     # (the template needs all three variables, so context and messages must be passed here too)
+     return llm_chain.invoke({"query": query, "context": context, "messages": messages})
+
+
+ def setup_rag(file_type, content, filename, namespace, index):
+     if file_type == "pdf":
+         text = get_text_from_content_for_doc(content)
+     elif file_type == "audio":
+         text = get_text_from_content_for_audio(content)
+     else:
+         raise ValueError(f"Unsupported file type: {file_type}")
+
+     chunks = get_text_chunks(text)
+
+     # Pass through the arguments get_vectorstore requires
+     vectorstore = get_vectorstore(chunks, filename=filename, file_type=file_type, namespace=namespace, index=index)
+
+     return vectorstore
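
Retrieval quality depends directly on the chunking parameters above, so here is a small standalone sketch (assuming langchain is installed) showing what the splitter produces with those settings; the sample text is synthetic:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same settings as get_text_chunks above
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
)

sample = "word " * 300  # roughly 1500 characters of dummy text
chunks = splitter.split_text(sample)

for i, chunk in enumerate(chunks):
    # Each chunk is at most 500 characters; neighbors share up to ~100 characters
    print(i, len(chunk))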
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ fastapi
+ uvicorn
+ python-multipart
+ pydantic
+ langchain-pinecone
+ pinecone-notebooks
+ pinecone-client[grpc]
+ async-timeout
+ pymupdf
+ python-dotenv
+ typing-extensions
+ langchain
+ langchain-openai
+ langchain-community