Upload 15 files
- .gitattributes +35 -0
- .gitignore +118 -0
- README.md +13 -0
- Uploaded/.file +1 -0
- app.py +189 -0
- chatMode.py +25 -0
- modules/llm.py +74 -0
- modules/pdfExtractor.py +22 -0
- modules/rag.py +109 -0
- modules/ragoop.py +73 -0
- pdfs/.file +0 -0
- requirements.txt +13 -0
- upload_log.json +1 -0
- upload_to_space.py +23 -0
- vectorDB/.file +1 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,118 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+venv/
+ENV/
+.venv/
+*.egg
+*.egg-info/
+dist/
+build/
+*.egg-info/
+
+# Jupyter Notebook checkpoints
+.ipynb_checkpoints
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.pyre/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# Pyre type checker
+.pyre/
+
+# mypy
+.mypy_cache/
+.dmypy.json
+.dmypy.json
+
+# environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# IDEs and editors
+.idea/
+.vscode/
+*.swp
+*.swo
+*.swn
+*.sublime-workspace
+*.sublime-project
+
+# VS Code extensions
+.vscode/
+
+# macOS
+.DS_Store
+
+# Windows
+Thumbs.db
+ehthumbs.db
+desktop.ini
+$RECYCLE.BIN/
+
+# PyCharm
+.idea/
+*.iml
+*.ipr
+*.iws
+
+# Local config files
+*.env
+*.local
+
+# System files
+.Python
+*~
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: ChatPDF RAG
+emoji: 📊
+colorFrom: red
+colorTo: red
+sdk: streamlit
+sdk_version: 1.39.0
+app_file: app.py
+pinned: false
+license: cc-by-4.0
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Uploaded/.file
ADDED
@@ -0,0 +1 @@
+chatPDF by Bipin Saha
app.py
ADDED
@@ -0,0 +1,189 @@
+import streamlit as st
+import os
+import uuid
+import shutil
+from datetime import datetime, timedelta
+from dotenv import load_dotenv
+from chatMode import chat_response
+from modules.pdfExtractor import PdfConverter
+from modules.rag import contextChunks, contextEmbeddingChroma, retrieveEmbeddingsChroma, ragQuery, similarityChroma
+from sentence_transformers import SentenceTransformer
+from modules.llm import GroqClient, GroqCompletion
+import chromadb
+import json
+
+# Load environment variables
+load_dotenv()
+
+######## Embedding Model ########
+embeddModel = SentenceTransformer(os.path.join(os.getcwd(), "embeddingModel"))
+embeddModel.max_seq_length = 512
+chunk_size, chunk_overlap, top_k_default = 2000, 200, 5
+
+######## Groq to LLM Connect ########
+api_key = os.getenv("GROQ_API_KEY")
+groq_client = GroqClient(api_key)
+llm_model = {
+    "Gemma9B": "gemma2-9b-it",
+    "Gemma7B": "gemma-7b-it",
+    "LLama3-70B-Preview": "llama3-groq-70b-8192-tool-use-preview",
+    "LLama3.1-70B": "llama-3.1-70b-versatile",
+    "LLama3-70B": "llama3-70b-8192",
+    "LLama3.2-90B": "llama-3.2-90b-text-preview",
+    "Mixtral8x7B": "mixtral-8x7b-32768"
+}
+max_tokens = {
+    "Gemma9B": 8192,
+    "Gemma7B": 8192,
+    "LLama3-70B": 8192,
+    "LLama3.1-70B": 8000,
+    "LLama3.2-90B": 8192,
+    "Mixtral8x7B": 32768
+}
+
+## Time-based cleanup settings
+EXPIRATION_TIME = timedelta(hours=6)
+UPLOAD_DIR = "Uploaded"
+VECTOR_DB_DIR = "vectorDB"
+LOG_FILE = "upload_log.json"
+
+## Initialize Streamlit app
+st.set_page_config(page_title="ChatPDF", layout="wide")
+st.markdown("<h2 style='text-align: center;'>chatPDF</h2>", unsafe_allow_html=True)
+
+## Function to log upload time
+def log_upload_time(unique_id):
+    upload_time = datetime.now().isoformat()
+    log_entry = {unique_id: upload_time}
+    if os.path.exists(LOG_FILE):
+        with open(LOG_FILE, "r") as f:
+            log_data = json.load(f)
+        log_data.update(log_entry)
+    else:
+        log_data = log_entry
+
+    with open(LOG_FILE, "w") as f:
+        json.dump(log_data, f)
+
+## Cleanup expired files based on log
+def cleanup_expired_files():
+    current_time = datetime.now()
+
+    # Load upload log
+    if os.path.exists(LOG_FILE):
+        with open(LOG_FILE, "r") as f:
+            log_data = json.load(f)
+
+        keys_to_delete = []  # List to keep track of keys to delete
+        # Check each entry in the log
+        for unique_id, upload_time in log_data.items():
+            upload_time_dt = datetime.fromisoformat(upload_time)
+            if current_time - upload_time_dt > EXPIRATION_TIME:
+                # Add key to the list for deletion
+                keys_to_delete.append(unique_id)
+
+                # Remove files if expired
+                pdf_file_path = os.path.join(UPLOAD_DIR, f"{unique_id}_paper.pdf")
+                vector_db_path = os.path.join(VECTOR_DB_DIR, unique_id)
+
+                if os.path.isfile(pdf_file_path):
+                    os.remove(pdf_file_path)
+                if os.path.isdir(vector_db_path):
+                    shutil.rmtree(vector_db_path)
+
+        # Now delete the keys from log_data after iteration
+        for key in keys_to_delete:
+            del log_data[key]
+
+        # Save updated log
+        with open(LOG_FILE, "w") as f:
+            json.dump(log_data, f)
+
+## Context Taking, PDF Upload, and Mode Selection
+with st.sidebar:
+    st.title("Upload PDF:")
+
+    research_field = st.text_input("Research Field: ", key="research_field", placeholder="Enter research fields with commas")
+    option = ''
+
+    if not research_field:
+        st.info("Please enter a research field to proceed.")
+        option = st.selectbox('Select Mode', ('Chat', 'Graph and Table', 'Code', 'Custom Prompting'), disabled=True)
+        uploaded_file = st.file_uploader("", type=["pdf"], disabled=True)
+    else:
+        option = st.selectbox('Select Mode', ('Chat', 'Graph and Table', 'Code', 'Custom Prompting'))
+        uploaded_file = st.file_uploader("", type=["pdf"], disabled=False)
+
+    temperature = st.slider("Select Temperature", min_value=0.0, max_value=1.0, value=0.05, step=0.01)
+    selected_llm_model = st.selectbox("Select LLM Model", options=list(llm_model.keys()), index=3)
+    top_k = st.slider("Select Top K Matches", min_value=1, max_value=20, value=5)
+
+## Initialize unique ID, db_client, db_path, and timestamp if not already in session state
+if 'db_client' not in st.session_state:
+    unique_id = str(uuid.uuid4())
+    st.session_state['unique_id'] = unique_id
+    db_path = os.path.join(VECTOR_DB_DIR, unique_id)
+    os.makedirs(db_path, exist_ok=True)
+    st.session_state['db_path'] = db_path
+    st.session_state['db_client'] = chromadb.PersistentClient(path=db_path)
+
+    # Log the upload time
+    log_upload_time(unique_id)
+
+# Access session-stored variables
+db_client = st.session_state['db_client']
+unique_id = st.session_state['unique_id']
+db_path = st.session_state['db_path']
+
+if 'document_text' not in st.session_state:
+    st.session_state['document_text'] = None
+
+if 'text_embeddings' not in st.session_state:
+    st.session_state['text_embeddings'] = None
+
+## Handle PDF Upload and Processing
+if uploaded_file is not None and st.session_state['document_text'] is None:
+    os.makedirs(UPLOAD_DIR, exist_ok=True)
+    file_path = os.path.join(UPLOAD_DIR, f"{unique_id}_paper.pdf")
+    with open(file_path, "wb") as file:
+        file.write(uploaded_file.getvalue())
+
+    document_text = PdfConverter(file_path).convert_to_markdown()
+    st.session_state['document_text'] = document_text
+
+    text_content_chunks = contextChunks(document_text, chunk_size, chunk_overlap)
+    text_contents_embeddings = contextEmbeddingChroma(embeddModel, text_content_chunks, db_client, db_path=db_path)
+    st.session_state['text_embeddings'] = text_contents_embeddings
+
+if st.session_state['document_text'] and st.session_state['text_embeddings']:
+    document_text = st.session_state['document_text']
+    text_contents_embeddings = st.session_state['text_embeddings']
+else:
+    st.stop()
+
+q_input = st.chat_input(key="input", placeholder="Ask your question")
+
+if q_input:
+    if option == "Chat":
+        query_embedding = ragQuery(embeddModel, q_input)
+        top_k_matches = similarityChroma(query_embedding, db_client, top_k)
+
+        LLMmodel = llm_model[selected_llm_model]
+        domain = research_field
+        prompt_template = q_input
+        user_content = top_k_matches
+        max_tokens = max_tokens[selected_llm_model]
+        print(max_tokens)
+        top_p = 1
+        stream = True
+        stop = None
+
+        groq_completion = GroqCompletion(groq_client, LLMmodel, domain, prompt_template, user_content, temperature, max_tokens, top_p, stream, stop)
+        result = groq_completion.create_completion()
+
+        with st.spinner("Processing..."):
+            chat_response(q_input, result)
+
+## Call the cleanup function periodically
+cleanup_expired_files()
chatMode.py
ADDED
@@ -0,0 +1,25 @@
+import streamlit as st
+import os
+
+def chat_response(user_prompt, assistant_response):
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+
+    for message in st.session_state.chat_history:
+        if message["role"] == "user":
+            with st.chat_message("user"):
+                st.write(f"**You**: {message['content']}")
+
+        elif message["role"] == "assistant":
+            with st.chat_message("assistant"):
+                st.write(f"**Assistant**: {message['content']}")
+
+    if user_prompt:
+        st.session_state.chat_history.append({"role": "user", "content": user_prompt})
+        with st.chat_message("user"):
+            st.write(f"**You**: {user_prompt}")
+
+        with st.chat_message("assistant"):
+            st.write(f"**Assistant**: {assistant_response}")
+
+        st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
modules/llm.py
ADDED
@@ -0,0 +1,74 @@
+import os
+from dotenv import load_dotenv
+from groq import Groq
+
+load_dotenv()
+
+class GroqClient:
+    def __init__(self, api_key):
+        self.client = Groq(api_key=api_key)
+
+
+class GroqCompletion:
+    def __init__(self, client, model, domain, prompt_template, user_content, temperature, max_tokens, top_p, stream, stop):
+        self.client = client
+        self.model = model
+        self.domain = domain
+        self.prompt_template = prompt_template
+        self.user_content = user_content
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.top_p = top_p
+        self.stream = stream
+        self.stop = stop
+
+    def create_completion(self):
+        prompt = f"{self.prompt_template}\n\n{self.user_content}\n"
+        system_role = f"You are a helpful AI assistant for text-based question answering, retrieving context from the given domain: {self.domain}"
+
+        completion = self.client.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {
+                    "role": "system",
+                    "content": system_role
+                },
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ],
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+            top_p=self.top_p,
+            stream=self.stream,
+            stop=self.stop,
+        )
+
+        result = ""
+        for chunk in completion:
+            result += chunk.choices[0].delta.content or ""
+
+        return result
+
+
+# Example usage
+# api_key = os.environ.get("GROQ_API_KEY")
+# groq_client = GroqClient(api_key)
+
+# model = "gemma2-9b-it"
+# domain = "LLM"
+# prompt_template = "Summarize this content in just one line"
+# user_content = """1. **Domain Adaptation and Inference**: He developed a novel semantic encoding and decoding (SEDO) algorithm that uses knowledge graphs to generate semantic labels for unlabeled data. He applied this algorithm to detect suicide risk on social media.
+# 2. **Weighted Constraints Conditioned on Time-Evolving Events**: He developed a semi-deep infusion-based framework that integrates real-world knowledge as weighted constraints conditioned upon time-evolving events. He applied this framework to estimate the rise in infection rate during a crisis event.
+# 3. **Matching and Ranking**: He developed a semi-deep K-IL system that models a patient's trust of GPs using knowledge of consultation history and ICD-10 graphs. He also applied this system to recommend patients to GPs."""
+# temperature = 0
+# max_tokens = 8192
+# top_p = 1
+# stream = True
+# stop = None
+
+# groq_completion = GroqCompletion(groq_client, model, domain, prompt_template, user_content, temperature, max_tokens, top_p, stream, stop)
+# result = groq_completion.create_completion()
+# print(result)
modules/pdfExtractor.py
ADDED
@@ -0,0 +1,22 @@
+import os
+import pymupdf4llm
+
+
+class PdfConverter:
+    def __init__(self, pdf_file):
+        self.pdf_file = pdf_file
+        self.md_text = None
+
+    def convert_to_markdown(self):
+        self.md_text = pymupdf4llm.to_markdown(self.pdf_file)
+        return self.md_text
+
+    def save_markdown(self, output_file):
+        with open(output_file, 'w') as file:
+            file.write(self.md_text)
+
+# Example usage
+# pdf_file = os.path.join(os.getcwd(), "pdfs", "test.pdf")
+# converter = PdfConverter(pdf_file)
+# text = converter.convert_to_markdown()
+# print(text)
modules/rag.py
ADDED
@@ -0,0 +1,109 @@
+import os
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.util import cos_sim
+from modules.pdfExtractor import PdfConverter
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
+
+# model = SentenceTransformer(
+#     "thenlper/gte-base",  # switch to en/zh for English or Chinese
+#     trust_remote_code=True
+# )
+# model.save(os.path.join(os.getcwd(), "embeddingModel"))
+
+
+def contextChunks(document_text, chunk_size, chunk_overlap):
+    document = Document(page_content=document_text)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    text_chunks = text_splitter.split_documents([document])
+    text_content_chunks = [chunk.page_content for chunk in text_chunks]
+    return text_content_chunks
+
+
+def contextEmbedding(model, text_content_chunks):
+    text_contents_embeddings = [model.encode([text]) for text in text_content_chunks]
+    return text_contents_embeddings
+
+def contextEmbeddingChroma(model, text_content_chunks, db_client, db_path):
+    text_contents_embeddings = [model.encode([text])[0] for text in text_content_chunks]
+    ids = [f"id_{i}" for i in range(len(text_content_chunks))]
+
+    collection = db_client.get_or_create_collection("embeddings_collection")
+
+    collection.add(
+        documents=text_content_chunks,
+        embeddings=text_contents_embeddings,
+        ids=ids  # Include the generated IDs
+    )
+
+    return text_contents_embeddings
+
+
+def retrieveEmbeddingsChroma(db_client):
+    collection_name = "embeddings_collection"
+    collection = db_client.get_collection(collection_name)
+
+    records = collection.get()
+    embeddings = []
+    text_chunks = []
+
+    if records and "documents" in records and "embeddings" in records:
+        text_chunks = records["documents"] or []
+        embeddings = records["embeddings"] or []
+    else:
+        print("No documents or embeddings found in the collection.")
+
+    return embeddings, text_chunks
+
+
+def ragQuery(model, query):
+    return model.encode([query])
+
+def similarity(query_embedding, text_contents_embeddings, text_content_chunks, top_k):
+    similarities = [(text, cos_sim(embedding, query_embedding[0]))
+                    for text, embedding in zip(text_content_chunks, text_contents_embeddings)]
+
+    similarities_sorted = sorted(similarities, key=lambda x: x[1], reverse=True)
+    top_k_texts = [text for text, _ in similarities_sorted[:top_k]]
+
+    return "\n".join(f"Text Chunk <{i + 1}>\n{element}" for i, element in enumerate(top_k_texts))
+
+
+def similarityChroma(query_embedding, db_client, top_k):
+    collection = db_client.get_collection("embeddings_collection")
+    results = collection.get(include=["documents", "embeddings"])
+
+    text_content_chunks = results["documents"]
+    text_contents_embeddings = np.array(results["embeddings"])
+
+    text_contents_embeddings = text_contents_embeddings.astype(np.float32)
+    query_embedding = query_embedding.astype(np.float32)
+
+    similarities = [
+        (text, cos_sim(embedding.reshape(1, -1), query_embedding.reshape(1, -1))[0][0])
+        for text, embedding in zip(text_content_chunks, text_contents_embeddings)
+    ]
+
+    similarities_sorted = sorted(similarities, key=lambda x: x[1], reverse=True)
+
+    top_k_texts = [text for text, _ in similarities_sorted[:top_k]]
+
+    return "\n".join(f"Text Chunk <{i + 1}>\n{element}" for i, element in enumerate(top_k_texts))
+
+
+# pdf_file = os.path.join(os.getcwd(), "pdfs", "test2.pdf")
+# converter = PdfConverter(pdf_file)
+# document_text = converter.convert_to_markdown()
+
+# chunk_size, chunk_overlap, top_k = 2000, 200, 5
+# query = "what metric used in this paper for performance evaluation?"
+
+# text_content_chunks = contextChunks(document_text, chunk_size, chunk_overlap)
+# text_contents_embeddings = contextEmbedding(model, text_content_chunks)
+# query_embedding = ragQuery(model, query)
+# top_k_matches = similarity(query_embedding, text_contents_embeddings, text_content_chunks, top_k)
+# print(top_k_matches[1])
modules/ragoop.py
ADDED
@@ -0,0 +1,73 @@
+import os
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.util import cos_sim
+from modules.pdfExtractor import PdfConverter
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
+
+class EmbeddingModel:
+    def __init__(self, model_path=None):
+        if model_path is None:
+            self.model = SentenceTransformer(
+                "thenlper/gte-base",  # switch to en/zh for English or Chinese
+                trust_remote_code=True
+            )
+            self.model.save(os.path.join(os.getcwd(), "embeddingModel"))
+        else:
+            self.model = SentenceTransformer(model_path)
+
+        self.model.max_seq_length = 512
+
+    def encode(self, texts):
+        return self.model.encode(texts)
+
+class DocumentProcessor:
+    def __init__(self, model, chunk_size=1000, chunk_overlap=200):
+        self.model = model
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+
+    def context_chunks(self, document_text):
+        document = Document(page_content=document_text)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap
+        )
+        text_chunks = text_splitter.split_documents([document])
+        text_content_chunks = [chunk.page_content for chunk in text_chunks]
+        return text_content_chunks
+
+    def context_embedding(self, text_content_chunks):
+        return [self.model.encode([text]) for text in text_content_chunks]
+
+    def rag_query(self, query):
+        return self.model.encode([query])
+
+    def similarity(self, query_embedding, text_contents_embeddings, text_content_chunks, top_k):
+        similarities = [
+            (text, cos_sim(embedding, query_embedding[0]))
+            for text, embedding in zip(text_content_chunks, text_contents_embeddings)
+        ]
+
+        similarities_sorted = sorted(similarities, key=lambda x: x[1], reverse=True)
+        top_k_texts = [text for text, _ in similarities_sorted[:top_k]]
+
+        return top_k_texts
+
+
+# Example usage:
+if __name__ == "__main__":
+    model = EmbeddingModel(model_path=os.path.join(os.getcwd(), "embeddingModel"))
+    processor = DocumentProcessor(model=model)
+
+    pdf_file = os.path.join(os.getcwd(), "pdfs", "test2.pdf")
+    converter = PdfConverter(pdf_file)
+    document_text = converter.convert_to_markdown()
+    text_chunks = processor.context_chunks(document_text)
+    text_embeddings = processor.context_embedding(text_chunks)
+
+    query = "what metric used in this paper for performance evaluation?"
+    query_embedding = processor.rag_query(query)
+    top_results = processor.similarity(query_embedding, text_embeddings, text_chunks, top_k=5)
+
+    print(top_results)
pdfs/.file
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,13 @@
+pymupdf4llm==0.0.17
+groq==0.11.0
+chromadb==0.5.11
+tiktoken==0.8.0
+langchain==0.3.2
+langchain-community==0.3.1
+langsmith==0.1.132
+sentence-transformers==3.1.1
+numpy
+fastapi
+uvicorn
+python-multipart==0.0.12
+python-dotenv==1.0.1
upload_log.json
ADDED
@@ -0,0 +1 @@
+{"32fe6152-eb2b-4805-838c-6227bca07d94": "2024-11-01T11:27:09.540866", "d00bcf07-de71-46c4-b083-cb1baa6060e6": "2024-11-01T11:30:55.332869", "12c64717-c9f9-4f7d-9493-510c138844c3": "2024-11-01T11:34:40.360413", "ea73c9e8-f113-4a3c-8431-21ad6a7fcc9c": "2024-11-01T12:42:11.897498"}
upload_to_space.py
ADDED
@@ -0,0 +1,23 @@
+from huggingface_hub import HfApi
+import os
+
+api = HfApi()
+space_id = "Kurian07/Ultimate_llm_rag"
+folder_path = "C:/Users/kuria/OneDrive/Desktop/llm/chatPDF-RAG/embeddingModel"
+
+for root, _, files in os.walk(folder_path):
+    for file in files:
+        file_path = os.path.join(root, file)
+        relative_path = os.path.relpath(file_path, folder_path)  # Path inside the repo
+
+        try:
+            # Attempt to upload each file
+            api.upload_file(
+                path_or_fileobj=file_path,
+                path_in_repo=relative_path,
+                repo_id=space_id,
+                repo_type="space"
+            )
+            print(f"Uploaded {relative_path} to Hugging Face Space.")
+        except Exception as e:
+            print(f"Failed to upload {relative_path}: {e}")
vectorDB/.file
ADDED
@@ -0,0 +1 @@
+chatPDF by Bipin Saha