Ilyas KHIAT committed
Commit: fe370a3
Parent: 98cbbb6

api first commit by me :)
Files changed:
- .gitignore +2 -0
- Dockerfile +16 -0
- main.py +160 -0
- rag.py +108 -0
- requirements.txt +16 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
__pycache__/
.env
Dockerfile
ADDED
@@ -0,0 +1,16 @@
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.12

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py
ADDED
@@ -0,0 +1,160 @@
from fastapi import FastAPI, HTTPException, UploadFile, File
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Json
from uuid import uuid4, UUID
from typing import Optional
import os
import time
import asyncio
import async_timeout
import pymupdf
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from rag import *

load_dotenv()

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

index_name = os.environ.get("INDEX_NAME")  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index on first startup and wait until it is ready
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=3072,  # dimension of text-embedding-3-large vectors
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

app = FastAPI()


class UserInput(BaseModel):
    prompt: str
    enterprise_id: str
    stream: Optional[bool] = False
    messages: Optional[list[dict]] = []


class EnterpriseData(BaseModel):
    name: str
    id: Optional[str] = None


tasks = []


@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.post("/upload")
async def upload_file(file: UploadFile, enterprise_data: Json[EnterpriseData]):
    try:
        # Read the uploaded file
        contents = await file.read()

        # Sanitize the enterprise name for use in the namespace id
        enterprise_name = (
            enterprise_data.name.replace(" ", "_")
            .replace("-", "_")
            .replace(".", "_")
            .replace("/", "_")
            .replace("\\", "_")
            .strip()
        )

        # Assign a new UUID if id is not provided
        if enterprise_data.id is None:
            enterprise_data.id = f"{enterprise_name}_{uuid4()}"

        # Open the file with PyMuPDF
        pdf_document = pymupdf.open(stream=contents, filetype="pdf")

        # Extract all text from the document
        text = ""
        for page in pdf_document:
            text += page.get_text()

        # Split the text into chunks
        text_chunks = get_text_chunks(text)

        # Create a vector store
        vector_store = get_vectorstore(
            text_chunks,
            filename=file.filename,
            file_type="pdf",
            namespace=enterprise_data.id,
            index=index,
        )

        if vector_store:
            return {
                "file_name": file.filename,
                "enterprise_id": enterprise_data.id,
                "number_of_chunks": len(text_chunks),
            }
        else:
            raise HTTPException(status_code=500, detail="Could not create vector store")

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")

    finally:
        await file.close()


@app.get("/documents/{enterprise_id}")
def get_documents(enterprise_id: str):
    try:
        docs_names = []
        # Vector ids have the form "<file_name>_<chunk_index>"; strip the
        # chunk index to recover each distinct document name
        for ids in index.list(namespace=enterprise_id):
            for id in ids:
                name_doc = "_".join(id.split("_")[:-1])
                if name_doc not in docs_names:
                    docs_names.append(name_doc)
        return docs_names
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")


@app.delete("/documents/all/{enterprise_id}")
def delete_all_documents(enterprise_id: str):
    try:
        index.delete(namespace=enterprise_id, delete_all=True)
        return {"message": "All documents deleted"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")


GENERATION_TIMEOUT_SEC = 60


# Note: currently unused; /generate-answer/ streams the raw LangChain
# generator directly instead of wrapping it with this timeout guard.
async def stream_generator(response):
    async with async_timeout.timeout(GENERATION_TIMEOUT_SEC):
        try:
            async for chunk in response:
                yield {"content": chunk}
        except asyncio.TimeoutError:
            raise HTTPException(status_code=504, detail="Stream timed out")


@app.post("/generate-answer/")
def generate_answer(user_input: UserInput):
    try:
        prompt = user_input.prompt
        enterprise_id = user_input.enterprise_id

        context = get_retreive_answer(enterprise_id, prompt, index)
        if not context:
            context = "No context found"

        answer = generate_response_via_langchain(
            prompt,
            model="gpt-4o",
            stream=user_input.stream,
            context=context,
            messages=user_input.messages,
        )

        if user_input.stream:
            return StreamingResponse(answer, media_type="application/json")

        return {
            "prompt": prompt,
            "answer": answer,
            "context": context,
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
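For reference, a minimal client sketch for these endpoints. It assumes the API is reachable at http://localhost:7860 and that FastAPI exposes the Json[EnterpriseData] argument as a JSON-encoded query parameter (this may vary with the FastAPI/Pydantic versions in use); report.pdf and the enterprise name are placeholders.

import json
import requests

BASE = "http://localhost:7860"

# Upload a PDF; the response carries the generated enterprise_id
# (the Pinecone namespace for this enterprise)
with open("report.pdf", "rb") as f:
    r = requests.post(
        f"{BASE}/upload",
        params={"enterprise_data": json.dumps({"name": "Acme Corp"})},
        files={"file": ("report.pdf", f, "application/pdf")},
    )
r.raise_for_status()
enterprise_id = r.json()["enterprise_id"]

# List the document names indexed under that namespace
print(requests.get(f"{BASE}/documents/{enterprise_id}").json())

# Ask a question against the indexed chunks (non-streaming)
r = requests.post(
    f"{BASE}/generate-answer/",
    json={"prompt": "Summarize the document.", "enterprise_id": enterprise_id},
)
print(r.json()["answer"])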
rag.py
ADDED
@@ -0,0 +1,108 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate


def get_text_from_content_for_doc(content):
    text = ""
    for page in content:
        text += content[page]["texte"]
    return text


def get_text_from_content_for_audio(content):
    return content["transcription"]


def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,       # the character length of each chunk
        chunk_overlap=100,    # the character overlap between consecutive chunks
        length_function=len,  # measure length in characters (the python len() fn.)
    )
    chunks = text_splitter.split_text(text)
    return chunks

def get_vectorstore(text_chunks, filename, file_type, namespace, index):
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)

        file_name = filename.split(".")[0].replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()

        documents = []
        uuids = []

        for i, chunk in enumerate(text_chunks):
            document = Document(
                page_content=chunk,
                metadata={"filename": filename, "file_type": file_type},
            )
            uuid = f"{file_name}_{i}"
            uuids.append(uuid)
            documents.append(document)

        vector_store.add_documents(documents=documents, ids=uuids)

        return True

    except Exception:
        # Swallow the error and signal failure to the caller
        return False


def get_retreive_answer(enterprise_id, prompt, index):
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=enterprise_id)

        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 3, "score_threshold": 0.6},
        )
        response = retriever.invoke(prompt)

        return response

    except Exception:
        return False


def generate_response_via_langchain(query: str, stream: bool = False, model: str = "gpt-4o-mini", context: str = "", messages=[]):
    # Define the prompt template (French: "Given the following context:
    # {context}, and the conversation history: {messages}, {query}")
    template = "Sachant le context suivant: {context}, et l'historique de la conversation: {messages}, {query}"
    prompt = PromptTemplate.from_template(template)

    # Initialize the OpenAI LLM with the specified model
    llm = ChatOpenAI(model=model)

    # Create an LLM chain with the prompt and the LLM
    llm_chain = prompt | llm | StrOutputParser()

    if stream:
        # Return a generator that yields streamed responses
        return llm_chain.astream({"query": query, "context": context, "messages": messages})

    # Invoke the LLM chain and return the result (all three template
    # variables must be supplied, or PromptTemplate raises a KeyError)
    return llm_chain.invoke({"query": query, "context": context, "messages": messages})


def setup_rag(file_type, content, filename, namespace, index):
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)

    chunks = get_text_chunks(text)

    vectorstore = get_vectorstore(chunks, filename=filename, file_type=file_type, namespace=namespace, index=index)

    return vectorstore
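One subtlety worth noting: get_retreive_answer returns a list of LangChain Document objects (or False on failure), and main.py interpolates that list into the prompt template as-is, so full Document reprs end up in the prompt. A sketch of joining only the chunk texts instead, assuming the same environment (OPENAI_API_KEY set, a populated Pinecone index handle) and a placeholder namespace:

# "my_namespace" and the index handle are placeholders, illustrative only
docs = get_retreive_answer("my_namespace", "What is the refund policy?", index)
if docs:
    # Keep only the retrieved chunk text, not the Document metadata/repr
    context = "\n\n".join(doc.page_content for doc in docs)
else:
    context = "No context found"

answer = generate_response_via_langchain("What is the refund policy?", context=context)
print(answer)

The chunking parameters themselves can be sanity-checked without any API keys; the filler text below is arbitrary:

text = "word " * 400                 # roughly 2,000 characters of filler
chunks = get_text_chunks(text)
print(len(chunks))                   # about 5 chunks at an effective 400-character stride
print(max(len(c) for c in chunks))   # every chunk stays at or below 500 characters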
requirements.txt
ADDED
@@ -0,0 +1,16 @@
fastapi
uvicorn
python-multipart
pydantic
langchain-pinecone
pinecone-notebooks
pinecone-client[grpc]
async-timeout
pymupdf
python-dotenv
typing-extensions
langchain
langchain-openai
langchain-community