Spaces:
Sleeping
Sleeping
Add .gitignore file
Browse files- .chainlit/config.toml +81 -0
- .env +1 -0
- .gitignore +25 -0
- Dockerfile +11 -0
- __pycache__/app.cpython-311.pyc +0 -0
- __pycache__/app.cpython-39.pyc +0 -0
- app.py +101 -0
- app_generic.py +147 -0
- chainlit.md +8 -0
- public/custom_styles.css +8 -0
- requirements.txt +99 -0
- requirements_1.txt +15 -0
- test.py +4 -0
- utilities/__pycache__/all_utilities.cpython-311.pyc +0 -0
- utilities/__pycache__/file_utilities.cpython-311.pyc +0 -0
- utilities/__pycache__/prompts.cpython-311.pyc +0 -0
- utilities/all_utilities.py +91 -0
- utilities/prompts.py +43 -0
.chainlit/config.toml
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
# Whether to enable telemetry (default: true). No personal data is collected.
|
3 |
+
enable_telemetry = true
|
4 |
+
|
5 |
+
# List of environment variables to be provided by each user to use the app.
|
6 |
+
user_env = []
|
7 |
+
|
8 |
+
# Duration (in seconds) during which the session is saved when the connection is lost
|
9 |
+
session_timeout = 3600
|
10 |
+
|
11 |
+
# Enable third parties caching (e.g LangChain cache)
|
12 |
+
cache = false
|
13 |
+
|
14 |
+
# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
|
15 |
+
# follow_symlink = false
|
16 |
+
|
17 |
+
[features]
|
18 |
+
# Show the prompt playground
|
19 |
+
prompt_playground = true
|
20 |
+
|
21 |
+
# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
|
22 |
+
unsafe_allow_html = false
|
23 |
+
|
24 |
+
# Process and display mathematical expressions. This can clash with "$" characters in messages.
|
25 |
+
latex = false
|
26 |
+
|
27 |
+
# Authorize users to upload files with messages
|
28 |
+
multi_modal = true
|
29 |
+
|
30 |
+
# Allows user to use speech to text
|
31 |
+
[features.speech_to_text]
|
32 |
+
enabled = false
|
33 |
+
# See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
|
34 |
+
# language = "en-US"
|
35 |
+
|
36 |
+
[UI]
|
37 |
+
# Name of the app and chatbot.
|
38 |
+
name = "Chatbot"
|
39 |
+
|
40 |
+
# Show the readme while the conversation is empty.
|
41 |
+
show_readme_as_default = true
|
42 |
+
|
43 |
+
# Description of the app and chatbot. This is used for HTML tags.
|
44 |
+
# description = ""
|
45 |
+
|
46 |
+
# Large size content are by default collapsed for a cleaner ui
|
47 |
+
default_collapse_content = true
|
48 |
+
|
49 |
+
# The default value for the expand messages settings.
|
50 |
+
default_expand_messages = false
|
51 |
+
|
52 |
+
# Hide the chain of thought details from the user in the UI.
|
53 |
+
hide_cot = false
|
54 |
+
|
55 |
+
# Link to your github repo. This will add a github button in the UI's header.
|
56 |
+
# github = ""
|
57 |
+
|
58 |
+
# Specify a CSS file that can be used to customize the user interface.
|
59 |
+
# The CSS file can be served from the public directory or via an external link.
|
60 |
+
# custom_css = "/public/test.css"
|
61 |
+
custom_css = "/public/custom_styles.css"
|
62 |
+
# Override default MUI light theme. (Check theme.ts)
|
63 |
+
[UI.theme.light]
|
64 |
+
background = "#E0F7FA" # Light Cyan for a refreshing background
|
65 |
+
paper = "#FFFFFF" # Keep the paper white for contrast
|
66 |
+
|
67 |
+
[UI.theme.light.primary]
|
68 |
+
main = "#0288D1" # A vibrant blue as the primary color
|
69 |
+
dark = "#01579B" # A deeper blue for darker elements
|
70 |
+
light = "#B3E5FC" # A light blue for accents and highlights
|
71 |
+
[UI.theme.dark]
|
72 |
+
background = "#1E3A5F" # A deep, rich blue for the background
|
73 |
+
paper = "#2C3E50" # Slightly lighter for paper elements
|
74 |
+
|
75 |
+
[UI.theme.dark.primary]
|
76 |
+
main = "#0288D1" # Same vibrant blue for consistency
|
77 |
+
dark = "#01579B" # A rich dark blue
|
78 |
+
light = "#4FC3F7" # A lighter blue for accents
|
79 |
+
|
80 |
+
[meta]
|
81 |
+
generated_by = "0.7.700"
|
.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
OPENAI_API_KEY=<redacted: revoke and rotate this key; keep .env out of version control>
|
.gitignore
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Ignore Python compiled files
|
2 |
+
*.pyc
|
3 |
+
*.pyo
|
4 |
+
__pycache__/
|
5 |
+
|
6 |
+
# Ignore environment variables and sensitive files
|
7 |
+
.env
|
8 |
+
.secret
|
9 |
+
|
10 |
+
# Ignore IDE-specific files (for example, VSCode)
|
11 |
+
.vscode/
|
12 |
+
.idea/
|
13 |
+
|
14 |
+
# Ignore log files
|
15 |
+
*.log
|
16 |
+
|
17 |
+
# Ignore system-specific files
|
18 |
+
.DS_Store
|
19 |
+
Thumbs.db
|
20 |
+
|
21 |
+
# Ignore the data folder
|
22 |
+
/data/
|
23 |
+
|
24 |
+
# Ignore the cache folder
|
25 |
+
/cache/
|
Dockerfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9

# Run as an unprivileged user (uid 1000).
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Copy only the dependency manifest first so the pip layer is reused when
# application code changes. Dockerfile COPY does not expand "~", so the
# destination is spelled with $HOME.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application once, owned by the runtime user.
# NOTE(review): add a .dockerignore so .env and __pycache__ are not baked into the image.
COPY --chown=user . $HOME/app

CMD ["chainlit", "run", "app.py", "--port", "7860"]
|
__pycache__/app.cpython-311.pyc
ADDED
Binary file (5.42 kB). View file
|
|
__pycache__/app.cpython-39.pyc
ADDED
Binary file (729 Bytes). View file
|
|
app.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Import Section ###
|
2 |
+
import chainlit as cl
|
3 |
+
import os
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from langchain_openai import ChatOpenAI
|
6 |
+
from langchain_core.runnables.config import RunnableConfig
|
7 |
+
from utilities.all_utilities import process_file
|
8 |
+
from utilities.prompts import get_opening_content
|
9 |
+
|
10 |
+
################
|
11 |
+
# General code
|
12 |
+
################
|
13 |
+
|
14 |
+
load_dotenv()
|
15 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
16 |
+
|
17 |
+
# ChatOpenAI Templates
|
18 |
+
|
19 |
+
@cl.action_callback("icelandic")
async def on_action(action):
    """Store "icelandic" as the session language and confirm the switch in chat."""
    cl.user_session.set("language", "icelandic")
    confirmation = cl.Message(content=f"Changing to {action.name}")
    await confirmation.send()
    # The action button is intentionally left in the UI; call
    # `await action.remove()` here to hide it after use.
|
25 |
+
|
26 |
+
@cl.action_callback("english")
async def on_action(action):
    """Store "english" as the session language and confirm the switch in chat."""
    cl.user_session.set("language", "english")
    confirmation = cl.Message(content=f"Changing to {action.name}")
    await confirmation.send()
    # The action button is intentionally left in the UI; call
    # `await action.remove()` here to hide it after use.
|
32 |
+
|
33 |
+
#############################################
|
34 |
+
### On Chat Start (Session Start) Section ###
|
35 |
+
#############################################
|
36 |
+
@cl.on_chat_start
async def on_chat_start():
    """Initialise a chat session.

    Shows the language buttons and the welcome text, asks whether to use the
    prompt cache, waits for a PDF upload, builds the RAG chain for it, and
    stores the chain and retriever in the user session.
    """
    actions = [
        cl.Action(name="icelandic", value="icelandic", description="Switch to Icelandic"),
        cl.Action(name="english", value="english", description="Switch to English"),
    ]

    await cl.Message(content="Languages", actions=actions).send()
    await cl.Message(content=get_opening_content()).send()

    prompt_cache_input = await cl.AskActionMessage(
        content="Do you want to use Prompt Cache?",
        actions=[
            cl.Action(name="yes", value="yes", label="✅ Yes"),
            cl.Action(name="no", value="no", label="❌ No"),
        ],
    ).send()
    # The ask-message resolves to None when the user never answers, so do not
    # call .get() on it unconditionally; treat "no answer" as "no cache".
    prompt_cache = (prompt_cache_input or {}).get("value") or "no"

    # Keep asking until the user actually uploads a file (each ask can time out).
    files = None
    while not files:
        files = await cl.AskFileMessage(
            content="Please upload a .pdf file to begin processing!",
            accept=["application/pdf"],
            max_size_mb=20,
            timeout=180,
        ).send()

    file = files[0]

    msg = cl.Message(
        content=f"Processing `{file.name}`...", disable_human_feedback=True
    )
    await msg.send()

    # process_file is synchronous (PDF parsing + embedding calls); run it in a
    # worker thread so other sessions are not stalled while it works.
    response = await cl.make_async(process_file)(file, prompt_cache)
    rag_chain = response["chain"]
    retriever = response["retriever"]

    msg.content = f"Processing `{file.name}` is complete."
    await msg.update()
    msg.content = f"You can now ask questions about `{file.name}`."
    await msg.update()

    cl.user_session.set("chain", rag_chain)
    cl.user_session.set("retriever", retriever)
|
83 |
+
|
84 |
+
##########################
|
85 |
+
### On Message Section ###
|
86 |
+
##########################
|
87 |
+
@cl.on_message
async def main(message: cl.Message):
    """Answer a user message by streaming the session's RAG chain output.

    Args:
        message: The incoming chat message; its ``content`` is the question.
    """
    chain = cl.user_session.get("chain")
    if chain is None:
        # A message can arrive before on_chat_start has stored the chain.
        await cl.Message(
            content="Please upload a .pdf file before asking questions."
        ).send()
        return

    # Ensure that message.content is not None or empty
    question = message.content
    if not question or not question.strip():
        await cl.Message(content="Please enter a question.").send()
        return

    language = cl.user_session.get("language", "english")
    msg = cl.Message(content="")

    async for chunk in chain.astream(
        {"question": question, "language": language},
        config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
    ):
        await msg.stream_token(chunk.content)

    await msg.send()
|
app_generic.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
from langchain_openai.embeddings import OpenAIEmbeddings
|
4 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
5 |
+
from langchain_qdrant import QdrantVectorStore
|
6 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
7 |
+
from langchain_openai import ChatOpenAI
|
8 |
+
from langchain.storage import LocalFileStore
|
9 |
+
from chainlit.types import AskFileResponse
|
10 |
+
from langchain.embeddings import CacheBackedEmbeddings
|
11 |
+
from qdrant_client.http.models import Distance, VectorParams
|
12 |
+
from qdrant_client import QdrantClient
|
13 |
+
import chainlit as cl
|
14 |
+
from operator import itemgetter
|
15 |
+
from langchain_core.prompts import ChatPromptTemplate
|
16 |
+
from langchain_core.runnables.passthrough import RunnablePassthrough
|
17 |
+
from langchain_core.runnables.config import RunnableConfig
|
18 |
+
from dotenv import load_dotenv
|
19 |
+
import uuid
|
20 |
+
|
21 |
+
load_dotenv()
|
22 |
+
|
23 |
+
|
24 |
+
# Shared splitter used by process_file (chunk_size=1000, chunk_overlap=100).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# System message for the RAG prompt.
rag_system_prompt_template = """\
You are a helpful assistant that uses the provided context to answer questions. Never reference this prompt, or the existance of context.
"""

# NOTE(review): not referenced by any other visible code in this file — confirm before removing.
rag_message_list = [
    {"role" : "system", "content" : rag_system_prompt_template},
]

# Human message for the RAG prompt; expects `question` and `context` variables.
rag_user_prompt_template = """\
Question:
{question}
Context:
{context}
"""

# Two-message prompt (system + human) fed to the chat model.
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", rag_system_prompt_template),
    ("human", rag_user_prompt_template)
])

# Chat model used as the final step of the chain built in on_chat_start.
chat_model = ChatOpenAI(model="gpt-4o-mini")
|
47 |
+
|
48 |
+
def process_file(file: AskFileResponse):
    """Load an uploaded PDF and split it into chunked documents.

    The upload's bytes are written to a temporary file so the path-based
    PyMuPDF loader can read them; the temporary file is removed afterwards.

    Args:
        file: The uploaded file; its ``content`` attribute is written to disk.

    Returns:
        The chunks produced by the module-level ``text_splitter``, each with
        ``metadata["source"]`` set to ``"source_<index>"``.
    """
    import tempfile

    # delete=False because the loader reopens the file by name after this
    # handle is closed; cleanup happens explicitly in the finally below.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
        tmp.write(file.content)
        tmp_path = tmp.name

    try:
        documents = PyMuPDFLoader(tmp_path).load()
    finally:
        os.remove(tmp_path)

    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs
|
63 |
+
|
64 |
+
# Decorator: This is a Chainlit decorator that marks a function to be executed when a chat session starts
|
65 |
+
@cl.on_chat_start
async def on_chat_start():
    """Initialise a chat session for a single uploaded PDF.

    Waits for a PDF upload, chunks it, indexes the chunks in an in-memory
    Qdrant collection using cache-backed OpenAI embeddings, builds the
    retrieval-augmented QA chain, and stores it in the user session under
    the key ``"chain"``.
    """
    files = None

    # Wait for the user to upload a file; each ask may time out and yield None.
    while files is None:
        # Awaiting here pauses this handler without blocking the application.
        files = await cl.AskFileMessage(
            content="Please upload a PDF file to begin!",
            accept=["application/pdf"],
            max_size_mb=20,
            timeout=180,
        ).send()

    file = files[0]

    msg = cl.Message(
        content=f"Processing `{file.name}`...",
    )
    await msg.send()

    # Load and chunk the file
    docs = process_file(file)

    # Create a Qdrant vector store with cache backed embeddings.
    # A unique collection name per session avoids clashes between uploads.
    collection_name = f"pdf_to_parse_{uuid.uuid4()}"
    client = QdrantClient(":memory:")
    client.create_collection(
        collection_name=collection_name,
        # size must match the output dimension of the embedding model below.
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )
    core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    store = LocalFileStore("./cache/")
    # Caching: previously computed embeddings are stored under ./cache/ and
    # reused, keyed by the embedding model name.
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        core_embeddings, store, namespace=core_embeddings.model
    )
    vectorstore = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=cached_embedder,
    )
    vectorstore.add_documents(docs)
    retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})

    # Chain: fetch context for the question, then prompt the chat model.
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | chat_prompt
        | chat_model
    )

    # Let the user know that the system is ready
    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()

    cl.user_session.set("chain", retrieval_augmented_qa_chain)
|
125 |
+
|
126 |
+
# Decorator: This Chainlit decorator is used to rename the authors of messages in the chat interface
|
127 |
+
@cl.author_rename
def rename(orig_author: str):
    """Map internal author names to display names; unknown names pass through."""
    display_names = {
        "ChatOpenAI": "the Generator...",
        "VectorStoreRetriever": "the Retriever...",
    }
    if orig_author in display_names:
        return display_names[orig_author]
    return orig_author
|
131 |
+
|
132 |
+
# Decorator: This Chainlit decorator marks a function to be executed when a new message is received in the chat
|
133 |
+
@cl.on_message
async def main(message: cl.Message):
    """Stream the session chain's answer to an incoming chat message.

    Args:
        message: The incoming chat message; its ``content`` is the question.
    """
    runnable = cl.user_session.get("chain")
    if runnable is None:
        # A message can arrive before on_chat_start has stored the chain.
        await cl.Message(
            content="Please upload a PDF file before asking questions."
        ).send()
        return

    msg = cl.Message(content="")

    # astream yields partial chunks, which are forwarded to the UI as they arrive.
    async for chunk in runnable.astream(
        {"question": message.content},
        config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
    ):
        await msg.stream_token(chunk.content)

    await msg.send()
|
chainlit.md
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Welcome to AI Engineering Bootcamp Cohort 4
|
2 |
+
|
3 |
+
Upload a document
|
4 |
+
|
5 |
+
Ask a question
|
6 |
+
|
7 |
+
|
8 |
+
|
public/custom_styles.css
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.message {
|
2 |
+
background-color: #E3F2FD !important; /* Light Blue background */
|
3 |
+
color: #1A237E !important; /* Dark Indigo text */
|
4 |
+
}
|
5 |
+
|
6 |
+
.MuiToolbar-root {
|
7 |
+
background-color: #b7dcf1 !important; /* Medium Blue background */
|
8 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.2.1
|
2 |
+
aiohappyeyeballs==2.4.3
|
3 |
+
aiohttp==3.10.8
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.7.0
|
6 |
+
anyio==3.7.1
|
7 |
+
async-timeout==4.0.3
|
8 |
+
asyncer==0.0.2
|
9 |
+
attrs==24.2.0
|
10 |
+
bidict==0.23.1
|
11 |
+
certifi==2024.8.30
|
12 |
+
chainlit==0.7.700
|
13 |
+
charset-normalizer==3.3.2
|
14 |
+
click==8.1.7
|
15 |
+
dataclasses-json==0.5.14
|
16 |
+
Deprecated==1.2.14
|
17 |
+
distro==1.9.0
|
18 |
+
exceptiongroup==1.2.2
|
19 |
+
fastapi==0.100.1
|
20 |
+
fastapi-socketio==0.0.10
|
21 |
+
filetype==1.2.0
|
22 |
+
frozenlist==1.4.1
|
23 |
+
googleapis-common-protos==1.65.0
|
24 |
+
greenlet==3.1.1
|
25 |
+
grpcio==1.66.2
|
26 |
+
grpcio-tools==1.62.3
|
27 |
+
h11==0.14.0
|
28 |
+
h2==4.1.0
|
29 |
+
hpack==4.0.0
|
30 |
+
httpcore==0.17.3
|
31 |
+
httpx==0.24.1
|
32 |
+
hyperframe==6.0.1
|
33 |
+
idna==3.10
|
34 |
+
importlib_metadata==8.4.0
|
35 |
+
jiter==0.5.0
|
36 |
+
jsonpatch==1.33
|
37 |
+
jsonpointer==3.0.0
|
38 |
+
langchain==0.3.0
|
39 |
+
langchain-community==0.3.0
|
40 |
+
langchain-core==0.3.1
|
41 |
+
langchain-openai==0.2.0
|
42 |
+
langchain-qdrant==0.1.4
|
43 |
+
langchain-text-splitters==0.3.0
|
44 |
+
langsmith==0.1.121
|
45 |
+
Lazify==0.4.0
|
46 |
+
marshmallow==3.22.0
|
47 |
+
multidict==6.1.0
|
48 |
+
mypy-extensions==1.0.0
|
49 |
+
nest-asyncio==1.6.0
|
50 |
+
numpy==1.26.4
|
51 |
+
openai==1.51.0
|
52 |
+
opentelemetry-api==1.27.0
|
53 |
+
opentelemetry-exporter-otlp==1.27.0
|
54 |
+
opentelemetry-exporter-otlp-proto-common==1.27.0
|
55 |
+
opentelemetry-exporter-otlp-proto-grpc==1.27.0
|
56 |
+
opentelemetry-exporter-otlp-proto-http==1.27.0
|
57 |
+
opentelemetry-instrumentation==0.48b0
|
58 |
+
opentelemetry-proto==1.27.0
|
59 |
+
opentelemetry-sdk==1.27.0
|
60 |
+
opentelemetry-semantic-conventions==0.48b0
|
61 |
+
orjson==3.10.7
|
62 |
+
packaging==23.2
|
63 |
+
portalocker==2.10.1
|
64 |
+
protobuf==4.25.5
|
65 |
+
pydantic==2.9.2
|
66 |
+
pydantic-settings==2.5.2
|
67 |
+
pydantic_core==2.23.4
|
68 |
+
PyJWT==2.9.0
|
69 |
+
PyMuPDF==1.24.10
|
70 |
+
PyMuPDFb==1.24.10
|
71 |
+
python-dotenv==1.0.1
|
72 |
+
python-engineio==4.9.1
|
73 |
+
python-graphql-client==0.4.3
|
74 |
+
python-multipart==0.0.6
|
75 |
+
python-socketio==5.11.4
|
76 |
+
PyYAML==6.0.2
|
77 |
+
qdrant-client==1.11.2
|
78 |
+
regex==2024.9.11
|
79 |
+
requests==2.32.3
|
80 |
+
simple-websocket==1.0.0
|
81 |
+
sniffio==1.3.1
|
82 |
+
SQLAlchemy==2.0.35
|
83 |
+
starlette==0.27.0
|
84 |
+
syncer==2.0.3
|
85 |
+
tenacity==8.5.0
|
86 |
+
tiktoken==0.7.0
|
87 |
+
tomli==2.0.1
|
88 |
+
tqdm==4.66.5
|
89 |
+
typing-inspect==0.9.0
|
90 |
+
typing_extensions==4.12.2
|
91 |
+
uptrace==1.26.0
|
92 |
+
urllib3==2.2.3
|
93 |
+
uvicorn==0.23.2
|
94 |
+
watchfiles==0.20.0
|
95 |
+
websockets==13.1
|
96 |
+
wrapt==1.16.0
|
97 |
+
wsproto==1.2.0
|
98 |
+
yarl==1.13.1
|
99 |
+
zipp==3.20.2
|
requirements_1.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy==1.26.4
|
2 |
+
chainlit==0.7.700 # 1.1.402
|
3 |
+
openai>=1.26.0
|
4 |
+
pymupdf==1.24.10
|
5 |
+
qdrant-client==1.11.0
|
6 |
+
langchain-text-splitters
|
7 |
+
langchain-core==0.2.27
|
8 |
+
langchain-community==0.2.10
|
9 |
+
langchain-experimental==0.0.64
|
10 |
+
langgraph-checkpoint==1.0.6
|
11 |
+
langgraph==0.2.16
|
12 |
+
langchain-qdrant==0.1.3
|
13 |
+
langchain-openai==0.1.9
|
14 |
+
pdfplumber==0.11.4
|
15 |
+
sentence-transformers==3.1.1
|
test.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Manual smoke test: build the RAG chain for a local PDF."""
from pathlib import Path
from types import SimpleNamespace

from utilities.all_utilities import process_file

# NOTE(review): absolute path kept from the original; confirm whether the
# repository-relative "data/" folder was intended.
PDF_PATH = Path("/data/Starting_Agile-Mark_Shead.pdf")

if __name__ == "__main__":
    # process_file reads the upload's `.content` bytes and needs a
    # prompt-cache choice, so wrap the PDF in an upload-like object.
    upload = SimpleNamespace(name=PDF_PATH.name, content=PDF_PATH.read_bytes())
    c = process_file(upload, "no")
|
utilities/__pycache__/all_utilities.cpython-311.pyc
ADDED
Binary file (5.46 kB). View file
|
|
utilities/__pycache__/file_utilities.cpython-311.pyc
ADDED
Binary file (3.14 kB). View file
|
|
utilities/__pycache__/prompts.cpython-311.pyc
ADDED
Binary file (1.45 kB). View file
|
|
utilities/all_utilities.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import tempfile
|
3 |
+
|
4 |
+
from chainlit.types import AskFileResponse
|
5 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
6 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
7 |
+
from qdrant_client import QdrantClient
|
8 |
+
from qdrant_client.http.models import Distance, VectorParams
|
9 |
+
from langchain_openai.embeddings import OpenAIEmbeddings
|
10 |
+
from langchain.storage import LocalFileStore
|
11 |
+
from langchain_qdrant import QdrantVectorStore
|
12 |
+
from langchain.embeddings import CacheBackedEmbeddings
|
13 |
+
from langchain_core.prompts import ChatPromptTemplate
|
14 |
+
from langchain_core.globals import set_llm_cache
|
15 |
+
from langchain_openai import ChatOpenAI
|
16 |
+
from langchain_core.caches import InMemoryCache
|
17 |
+
from langchain_core.runnables.passthrough import RunnablePassthrough
|
18 |
+
from uuid import uuid4
|
19 |
+
from utilities.prompts import get_system_template, get_user_template
|
20 |
+
|
21 |
+
|
22 |
+
def load_file(file: AskFileResponse, chunk_size=1000, chunk_overlap=100):
    """Load a PDF and split it into chunked documents.

    Args:
        file: Either an uploaded file object exposing its bytes as
            ``content``, or a filesystem path (``str`` / ``os.PathLike``)
            to an existing PDF.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The split documents, each with ``metadata["source"]`` set to
        ``"source_<index>"``.
    """
    if isinstance(file, (str, os.PathLike)):
        # Already on disk: load it directly.
        documents = PyMuPDFLoader(os.fspath(file)).load()
    else:
        # The loader is path-based, so spill the uploaded bytes to a temp file.
        # delete=False because the loader reopens it by name after this handle
        # closes; it is removed explicitly once loading finishes.
        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
            tmp.write(file.content)
            tmp_path = tmp.name
        try:
            documents = PyMuPDFLoader(tmp_path).load()
        finally:
            os.remove(tmp_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs
|
37 |
+
|
38 |
+
|
39 |
+
def process_embeddings(docs):
    """Index ``docs`` in a fresh in-memory Qdrant collection and return a retriever.

    Embeddings come from OpenAI's ``text-embedding-3-small`` wrapped in a
    file-backed cache under ``./cache/``. The returned retriever uses MMR
    search with ``k=3``.
    """
    base_embedder = OpenAIEmbeddings(model="text-embedding-3-small")

    # Unique name per call so repeated uploads never share a collection.
    name = f"pdf_to_parse_{uuid4()}"
    qdrant = QdrantClient(":memory:")
    # size must match the output dimension of the embedding model above.
    vector_params = VectorParams(size=1536, distance=Distance.COSINE)
    qdrant.create_collection(collection_name=name, vectors_config=vector_params)

    # Embedding cache keyed by model name.
    embedder = CacheBackedEmbeddings.from_bytes_store(
        base_embedder,
        LocalFileStore("./cache/"),
        namespace=base_embedder.model,
    )

    index = QdrantVectorStore(
        client=qdrant,
        collection_name=name,
        embedding=embedder,
    )
    index.add_documents(docs)
    return index.as_retriever(search_type="mmr", search_kwargs={"k": 3})
|
61 |
+
|
62 |
+
|
63 |
+
def prepare_rag_chain(retriever, prompt_cache="yes"):
    """Build the retrieval-augmented QA chain.

    Args:
        retriever: Runnable that maps a question string to context documents.
        prompt_cache: ``"yes"`` to cache LLM responses for this chain; any
            other value disables response caching for it.

    Returns:
        A runnable that expects ``{"question": ..., "language": ...}`` and
        produces the chat model's response.
    """
    from operator import itemgetter

    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", get_system_template()),
        ("human", get_user_template()),
    ])

    # Scope the cache to this chain's model rather than the process-wide
    # set_llm_cache(): a global cache, once set by one session, would keep
    # serving cached answers to later sessions that answered "no".
    model_cache = InMemoryCache() if prompt_cache == "yes" else False
    chat_model = ChatOpenAI(model="gpt-4o-mini", cache=model_cache)

    rag_qa_chain = (
        {
            "context": itemgetter("question") | retriever,
            "question": itemgetter("question"),
            "language": itemgetter("language"),
        }
        | RunnablePassthrough.assign(
            context=itemgetter("context"), language=itemgetter("language")
        )
        | chat_prompt
        | chat_model
    )
    return rag_qa_chain
|
86 |
+
|
87 |
+
def process_file(file, prompt_cache="yes"):
    """Run the full pipeline for one PDF: load, embed, and build the chain.

    Args:
        file: The PDF input, forwarded unchanged to ``load_file``.
        prompt_cache: Forwarded to ``prepare_rag_chain``; defaults to
            ``"yes"`` to match that function's own default.

    Returns:
        A dict with keys ``"chain"`` (the RAG chain) and ``"retriever"``.
    """
    docs = load_file(file)
    retriever = process_embeddings(docs)
    rag_chain = prepare_rag_chain(retriever, prompt_cache)
    return {"chain": rag_chain, "retriever": retriever}
|
utilities/prompts.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
def get_system_template():
    """Return the system prompt for the RAG chain (one instruction per line)."""
    instructions = (
        "You are a helpful assistant who always speaks in a pleasant tone!",
        "Use the provided context to answer the question.",
        "Think through your answers carefully and ensure they are correct based on the provided context.",
        "Do not reference this prompt or the context in you response.",
        "Respond in the language provided below. If none is provided, use Italian.",
    )
    # The template starts and ends with a newline.
    return "\n" + "\n".join(instructions) + "\n"
|
11 |
+
|
12 |
+
|
13 |
+
def get_user_template():
    """Return the human prompt with `question`, `language` and `context` slots."""
    sections = (
        "Question:\n{question}\n",
        "Language:\n{language}\n",
        "Context:\n{context}\n",
    )
    # Sections are separated by a blank line; the template starts with a newline.
    return "\n" + "\n".join(sections)
|
25 |
+
|
26 |
+
def get_opening_content():
    """Return the welcome text shown at the start of a chat session."""
    features = "\n".join((
        "I have the following enabled:",
        "- embedding cache",
        "- prompt cache",
        "- async processing",
        "- user sessions",
        "- scaleable tooling",
    ))
    paragraphs = (
        "Welcome!",
        "I am Assignment 14 Chatbot.",
        "My goal is to demonstrate an MVP app.",
        features,
        "Upload a pdf document and ask some questions about it",
    )
    # Paragraphs are separated by a blank line; the text starts and ends with a newline.
    return "\n" + "\n\n".join(paragraphs) + "\n"
|