Kurian07 committed
Commit 60fc5e8 · verified · 1 Parent(s): b5e531d

Upload 15 files
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,106 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ env/
+ venv/
+ ENV/
+ .venv/
+ *.egg
+ *.egg-info/
+ dist/
+ build/
+
+ # Jupyter Notebook checkpoints
+ .ipynb_checkpoints
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # Pyre type checker
+ .pyre/
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+
+ # environments
+ .env
+ env.bak/
+ venv.bak/
+
+ # IDEs and editors
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+ *.swn
+ *.sublime-workspace
+ *.sublime-project
+
+ # macOS
+ .DS_Store
+
+ # Windows
+ Thumbs.db
+ ehthumbs.db
+ desktop.ini
+ $RECYCLE.BIN/
+
+ # PyCharm
+ *.iml
+ *.ipr
+ *.iws
+
+ # Local config files
+ *.env
+ *.local
+
+ # System files
+ *~
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: ChatPDF RAG
+ emoji: 📊
+ colorFrom: red
+ colorTo: red
+ sdk: streamlit
+ sdk_version: 1.39.0
+ app_file: app.py
+ pinned: false
+ license: cc-by-4.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Uploaded/.file ADDED
@@ -0,0 +1 @@
+ chatPDF by Bipin Saha
app.py ADDED
@@ -0,0 +1,188 @@
+ import streamlit as st
+ import os
+ import uuid
+ import shutil
+ from datetime import datetime, timedelta
+ from dotenv import load_dotenv
+ from chatMode import chat_response
+ from modules.pdfExtractor import PdfConverter
+ from modules.rag import contextChunks, contextEmbeddingChroma, retrieveEmbeddingsChroma, ragQuery, similarityChroma
+ from sentence_transformers import SentenceTransformer
+ from modules.llm import GroqClient, GroqCompletion
+ import chromadb
+ import json
+
+ # Load environment variables
+ load_dotenv()
+
+ ######## Embedding Model ########
+ embeddModel = SentenceTransformer(os.path.join(os.getcwd(), "embeddingModel"))
+ embeddModel.max_seq_length = 512
+ chunk_size, chunk_overlap, top_k_default = 2000, 200, 5
+
+ ######## Groq to LLM Connect ########
+ api_key = os.getenv("GROQ_API_KEY")
+ groq_client = GroqClient(api_key)
+ llm_model = {
+     "Gemma9B": "gemma2-9b-it",
+     "Gemma7B": "gemma-7b-it",
+     "LLama3-70B-Preview": "llama3-groq-70b-8192-tool-use-preview",
+     "LLama3.1-70B": "llama-3.1-70b-versatile",
+     "LLama3-70B": "llama3-70b-8192",
+     "LLama3.2-90B": "llama-3.2-90b-text-preview",
+     "Mixtral8x7B": "mixtral-8x7b-32768"
+ }
+ max_tokens = {
+     "Gemma9B": 8192,
+     "Gemma7B": 8192,
+     "LLama3-70B-Preview": 8192,
+     "LLama3.1-70B": 8000,
+     "LLama3-70B": 8192,
+     "LLama3.2-90B": 8192,
+     "Mixtral8x7B": 32768
+ }
+
+ ## Time-based cleanup settings
+ EXPIRATION_TIME = timedelta(hours=6)
+ UPLOAD_DIR = "Uploaded"
+ VECTOR_DB_DIR = "vectorDB"
+ LOG_FILE = "upload_log.json"
+
+ ## Initialize Streamlit app
+ st.set_page_config(page_title="ChatPDF", layout="wide")
+ st.markdown("<h2 style='text-align: center;'>chatPDF</h2>", unsafe_allow_html=True)
+
+ ## Function to log upload time
+ def log_upload_time(unique_id):
+     upload_time = datetime.now().isoformat()
+     log_entry = {unique_id: upload_time}
+     if os.path.exists(LOG_FILE):
+         with open(LOG_FILE, "r") as f:
+             log_data = json.load(f)
+         log_data.update(log_entry)
+     else:
+         log_data = log_entry
+
+     with open(LOG_FILE, "w") as f:
+         json.dump(log_data, f)
+
+ ## Cleanup expired files based on log
+ def cleanup_expired_files():
+     current_time = datetime.now()
+
+     # Load upload log
+     if os.path.exists(LOG_FILE):
+         with open(LOG_FILE, "r") as f:
+             log_data = json.load(f)
+
+         keys_to_delete = []  # List to keep track of keys to delete
+         # Check each entry in the log
+         for unique_id, upload_time in log_data.items():
+             upload_time_dt = datetime.fromisoformat(upload_time)
+             if current_time - upload_time_dt > EXPIRATION_TIME:
+                 # Add key to the list for deletion
+                 keys_to_delete.append(unique_id)
+
+                 # Remove files if expired
+                 pdf_file_path = os.path.join(UPLOAD_DIR, f"{unique_id}_paper.pdf")
+                 vector_db_path = os.path.join(VECTOR_DB_DIR, unique_id)
+
+                 if os.path.isfile(pdf_file_path):
+                     os.remove(pdf_file_path)
+                 if os.path.isdir(vector_db_path):
+                     shutil.rmtree(vector_db_path)
+
+         # Now delete the keys from log_data after iteration
+         for key in keys_to_delete:
+             del log_data[key]
+
+         # Save updated log
+         with open(LOG_FILE, "w") as f:
+             json.dump(log_data, f)
+
+ ## Context Taking, PDF Upload, and Mode Selection
+ with st.sidebar:
+     st.title("Upload PDF:")
+
+     research_field = st.text_input("Research Field: ", key="research_field", placeholder="Enter research fields with commas")
+     option = ''
+
+     if not research_field:
+         st.info("Please enter a research field to proceed.")
+         option = st.selectbox('Select Mode', ('Chat', 'Graph and Table', 'Code', 'Custom Prompting'), disabled=True)
+         uploaded_file = st.file_uploader("", type=["pdf"], disabled=True)
+     else:
+         option = st.selectbox('Select Mode', ('Chat', 'Graph and Table', 'Code', 'Custom Prompting'))
+         uploaded_file = st.file_uploader("", type=["pdf"], disabled=False)
+
+     temperature = st.slider("Select Temperature", min_value=0.0, max_value=1.0, value=0.05, step=0.01)
+     selected_llm_model = st.selectbox("Select LLM Model", options=list(llm_model.keys()), index=3)
+     top_k = st.slider("Select Top K Matches", min_value=1, max_value=20, value=5)
+
+ ## Initialize unique ID, db_client, db_path, and timestamp if not already in session state
+ if 'db_client' not in st.session_state:
+     unique_id = str(uuid.uuid4())
+     st.session_state['unique_id'] = unique_id
+     db_path = os.path.join(VECTOR_DB_DIR, unique_id)
+     os.makedirs(db_path, exist_ok=True)
+     st.session_state['db_path'] = db_path
+     st.session_state['db_client'] = chromadb.PersistentClient(path=db_path)
+
+     # Log the upload time
+     log_upload_time(unique_id)
+
+ # Access session-stored variables
+ db_client = st.session_state['db_client']
+ unique_id = st.session_state['unique_id']
+ db_path = st.session_state['db_path']
+
+ if 'document_text' not in st.session_state:
+     st.session_state['document_text'] = None
+
+ if 'text_embeddings' not in st.session_state:
+     st.session_state['text_embeddings'] = None
+
+ ## Handle PDF Upload and Processing
+ if uploaded_file is not None and st.session_state['document_text'] is None:
+     os.makedirs(UPLOAD_DIR, exist_ok=True)
+     file_path = os.path.join(UPLOAD_DIR, f"{unique_id}_paper.pdf")
+     with open(file_path, "wb") as file:
+         file.write(uploaded_file.getvalue())
+
+     document_text = PdfConverter(file_path).convert_to_markdown()
+     st.session_state['document_text'] = document_text
+
+     text_content_chunks = contextChunks(document_text, chunk_size, chunk_overlap)
+     text_contents_embeddings = contextEmbeddingChroma(embeddModel, text_content_chunks, db_client, db_path=db_path)
+     st.session_state['text_embeddings'] = text_contents_embeddings
+
+ if st.session_state['document_text'] and st.session_state['text_embeddings']:
+     document_text = st.session_state['document_text']
+     text_contents_embeddings = st.session_state['text_embeddings']
+ else:
+     st.stop()
+
+ q_input = st.chat_input(key="input", placeholder="Ask your question")
+
+ if q_input:
+     if option == "Chat":
+         query_embedding = ragQuery(embeddModel, q_input)
+         top_k_matches = similarityChroma(query_embedding, db_client, top_k)
+
+         LLMmodel = llm_model[selected_llm_model]
+         domain = research_field
+         prompt_template = q_input
+         user_content = top_k_matches
+         llm_max_tokens = max_tokens[selected_llm_model]  # per-model output limit, kept distinct from the max_tokens dict
+         top_p = 1
+         stream = True
+         stop = None
+
+         groq_completion = GroqCompletion(groq_client, LLMmodel, domain, prompt_template, user_content, temperature, llm_max_tokens, top_p, stream, stop)
+         result = groq_completion.create_completion()
+
+         with st.spinner("Processing..."):
+             chat_response(q_input, result)
+
+ ## Call the cleanup function periodically
+ cleanup_expired_files()
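
A quick way to sanity-check the Groq wiring outside the Space: the only secret app.py reads is GROQ_API_KEY, loaded via python-dotenv. Below is a minimal sketch assuming a `.env` file with that key sits in the repo root; the model id comes from app.py's table, while the prompt strings are illustrative only.

```python
# Minimal local smoke test for modules/llm.py (assumes GROQ_API_KEY in .env).
import os
from dotenv import load_dotenv
from modules.llm import GroqClient, GroqCompletion

load_dotenv()
client = GroqClient(os.getenv("GROQ_API_KEY"))
completion = GroqCompletion(
    client, "llama3-70b-8192", "LLM",  # model id taken from app.py's llm_model table
    "Answer in one line:", "What is retrieval-augmented generation?",
    temperature=0.05, max_tokens=512, top_p=1, stream=True, stop=None,
)
print(completion.create_completion())
```

With the key in place, `streamlit run app.py` starts the full interface locally; on the Space it launches automatically per the README.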
chatMode.py ADDED
@@ -0,0 +1,25 @@
+ import streamlit as st
+ import os
+
+ def chat_response(user_prompt, assistant_response):
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = []
+
+     for message in st.session_state.chat_history:
+         if message["role"] == "user":
+             with st.chat_message("user"):
+                 st.write(f"**You**: {message['content']}")
+
+         elif message["role"] == "assistant":
+             with st.chat_message("assistant"):
+                 st.write(f"**Assistant**: {message['content']}")
+
+     if user_prompt:
+         st.session_state.chat_history.append({"role": "user", "content": user_prompt})
+         with st.chat_message("user"):
+             st.write(f"**You**: {user_prompt}")
+
+         with st.chat_message("assistant"):
+             st.write(f"**Assistant**: {assistant_response}")
+
+         st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
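
For context, a sketch of how this helper behaves across turns (the prompt and response strings are hypothetical): each call first replays the stored history, then renders and appends the new exchange, so the transcript survives Streamlit reruns.

```python
# Inside a running Streamlit script that imports chat_response:
chat_response("What dataset does the paper use?", "It evaluates on MNIST.")   # renders turn 1
chat_response("Which metric is reported?", "Classification accuracy.")        # replays turn 1, then renders turn 2
```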
modules/llm.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ from dotenv import load_dotenv
+ from groq import Groq
+
+ load_dotenv()
+
+ class GroqClient:
+     def __init__(self, api_key):
+         self.client = Groq(api_key=api_key)
+
+
+ class GroqCompletion:
+     def __init__(self, client, model, domain, prompt_template, user_content, temperature, max_tokens, top_p, stream, stop):
+         self.client = client
+         self.model = model
+         self.domain = domain
+         self.prompt_template = prompt_template
+         self.user_content = user_content
+         self.temperature = temperature
+         self.max_tokens = max_tokens
+         self.top_p = top_p
+         self.stream = stream
+         self.stop = stop
+
+     def create_completion(self):
+         prompt = f"{self.prompt_template}\n\n{self.user_content}\n"
+         system_role = f"You are a helpful AI assistant for text-based question answering, retrieving context from the given domain: {self.domain}"
+
+         completion = self.client.client.chat.completions.create(
+             model=self.model,
+             messages=[
+                 {
+                     "role": "system",
+                     "content": system_role
+                 },
+                 {
+                     "role": "user",
+                     "content": prompt
+                 }
+             ],
+             temperature=self.temperature,
+             max_tokens=self.max_tokens,
+             top_p=self.top_p,
+             stream=self.stream,
+             stop=self.stop,
+         )
+
+         # Accumulate the streamed chunks into a single response string
+         result = ""
+         for chunk in completion:
+             result += chunk.choices[0].delta.content or ""
+
+         return result
+
+
+
+ # # Example usage
+ # api_key = os.environ.get("GROQ_API_KEY")
+ # groq_client = GroqClient(api_key)
+
+ # model = "gemma2-9b-it"
+ # domain = "LLM"
+ # prompt_template = "Summarize me this content in just one line"
+ # user_content = """1. **Domain Adaptation and Inference**: He developed a novel semantic encoding and decoding (SEDO) algorithm that uses knowledge graphs to generate semantic labels for unlabeled data. He applied this algorithm to detect suicide risk on social media.
+ # 2. **Weighted Constraints Conditioned on Time-Evolving Events**: He developed a semi-deep infusion-based framework that integrates real-world knowledge as weighted constraints conditioned upon time-evolving events. He applied this framework to estimate the rise in infection rate during a crisis event.
+ # 3. **Matching and Ranking**: He developed a semi-deep K-IL system that models a patient's trust of GPs using knowledge of consultation history and ICD-10 graphs. He also applied this system to recommend patients to GPs."""
+ # temperature = 0
+ # max_tokens = 8192
+ # top_p = 1
+ # stream = True
+ # stop = None
+
+ # groq_completion = GroqCompletion(groq_client, model, domain, prompt_template, user_content, temperature, max_tokens, top_p, stream, stop)
+ # result = groq_completion.create_completion()
+ # print(result)
modules/pdfExtractor.py ADDED
@@ -0,0 +1,22 @@
+ import os
+ import pymupdf4llm
+
+
+ class PdfConverter:
+     def __init__(self, pdf_file):
+         self.pdf_file = pdf_file
+         self.md_text = None
+
+     def convert_to_markdown(self):
+         self.md_text = pymupdf4llm.to_markdown(self.pdf_file)
+         return self.md_text
+
+     def save_markdown(self, output_file):
+         with open(output_file, 'w') as file:
+             file.write(self.md_text)
+
+ # Example usage
+ # pdf_file = os.path.join(os.getcwd(), "pdfs", "test.pdf")
+ # converter = PdfConverter(pdf_file)
+ # text = converter.convert_to_markdown()
+ # print(text)
modules/rag.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.util import cos_sim
+ from modules.pdfExtractor import PdfConverter
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+
+ # model = SentenceTransformer(
+ #     "thenlper/gte-base",  # switch to en/zh for English or Chinese
+ #     trust_remote_code=True
+ # )
+ # model.save(os.path.join(os.getcwd(), "embeddingModel"))
+
+
+ def contextChunks(document_text, chunk_size, chunk_overlap):
+     document = Document(page_content=document_text)
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     text_chunks = text_splitter.split_documents([document])
+     text_content_chunks = [chunk.page_content for chunk in text_chunks]
+     return text_content_chunks
+
+
+ def contextEmbedding(model, text_content_chunks):
+     text_contents_embeddings = [model.encode([text]) for text in text_content_chunks]
+     return text_contents_embeddings
+
+ def contextEmbeddingChroma(model, text_content_chunks, db_client, db_path):
+
+     text_contents_embeddings = [model.encode([text])[0] for text in text_content_chunks]
+     ids = [f"id_{i}" for i in range(len(text_content_chunks))]
+
+     collection = db_client.get_or_create_collection("embeddings_collection")
+
+     collection.add(
+         documents=text_content_chunks,
+         embeddings=text_contents_embeddings,
+         ids=ids  # Include the generated IDs
+     )
+
+     return text_contents_embeddings
+
+
+ def retrieveEmbeddingsChroma(db_client):
+     collection_name = "embeddings_collection"
+     collection = db_client.get_collection(collection_name)
+
+     records = collection.get()
+     embeddings = []
+     text_chunks = []
+
+     if records and "documents" in records and "embeddings" in records:
+         text_chunks = records["documents"] or []
+         embeddings = records["embeddings"] or []
+     else:
+         print("No documents or embeddings found in the collection.")
+
+     return embeddings, text_chunks
+
+
+ def ragQuery(model, query):
+     return model.encode([query])
+
+ def similarity(query_embedding, text_contents_embeddings, text_content_chunks, top_k):
+     similarities = [(text, cos_sim(embedding, query_embedding[0]))
+                     for text, embedding in zip(text_content_chunks, text_contents_embeddings)]
+
+     similarities_sorted = sorted(similarities, key=lambda x: x[1], reverse=True)
+     top_k_texts = [text for text, _ in similarities_sorted[:top_k]]
+
+     return "\n".join(f"Text Chunk <{i + 1}>\n{element}" for i, element in enumerate(top_k_texts))
+
+
+ def similarityChroma(query_embedding, db_client, top_k):
+     collection = db_client.get_collection("embeddings_collection")
+     results = collection.get(include=["documents", "embeddings"])
+
+     text_content_chunks = results["documents"]
+     text_contents_embeddings = np.array(results["embeddings"])
+
+     text_contents_embeddings = text_contents_embeddings.astype(np.float32)
+     query_embedding = query_embedding.astype(np.float32)
+
+     similarities = [
+         (text, cos_sim(embedding.reshape(1, -1), query_embedding.reshape(1, -1))[0][0])
+         for text, embedding in zip(text_content_chunks, text_contents_embeddings)
+     ]
+
+     similarities_sorted = sorted(similarities, key=lambda x: x[1], reverse=True)
+
+     top_k_texts = [text for text, _ in similarities_sorted[:top_k]]
+
+     return "\n".join(f"Text Chunk <{i + 1}>\n{element}" for i, element in enumerate(top_k_texts))
+
+
+
+
+ # pdf_file = os.path.join(os.getcwd(), "pdfs", "test2.pdf")
+ # converter = PdfConverter(pdf_file)
+ # document_text = converter.convert_to_markdown()
+
+ # chunk_size, chunk_overlap, top_k = 2000, 200, 5
+ # query = "what metric used in this paper for performance evaluation?"
+
+ # text_content_chunks = contextChunks(document_text, chunk_size, chunk_overlap)
+ # text_contents_embeddings = contextEmbedding(model, text_content_chunks)
+ # query_embedding = ragQuery(model, query)
+ # top_k_matches = similarity(query_embedding, text_contents_embeddings, text_content_chunks, top_k)
+ # print(top_k_matches)
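
The commented demo above exercises the in-memory `contextEmbedding`/`similarity` path; here is a corresponding sketch for the Chroma-backed path that app.py actually uses. The PDF name, db path, and query are assumptions; everything else is this module's own API.

```python
import os
import chromadb
from sentence_transformers import SentenceTransformer
from modules.pdfExtractor import PdfConverter
from modules.rag import contextChunks, contextEmbeddingChroma, ragQuery, similarityChroma

model = SentenceTransformer(os.path.join(os.getcwd(), "embeddingModel"))
model.max_seq_length = 512

# Convert, chunk, and embed a local PDF into a persistent Chroma collection.
document_text = PdfConverter(os.path.join("pdfs", "test2.pdf")).convert_to_markdown()
chunks = contextChunks(document_text, 2000, 200)
db_path = os.path.join("vectorDB", "demo")
os.makedirs(db_path, exist_ok=True)
db_client = chromadb.PersistentClient(path=db_path)
contextEmbeddingChroma(model, chunks, db_client, db_path=db_path)

# Embed the query and retrieve the top-5 chunks as one formatted string.
query_embedding = ragQuery(model, "What metric is used for performance evaluation?")
print(similarityChroma(query_embedding, db_client, 5))
```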
modules/ragoop.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.util import cos_sim
+ from modules.pdfExtractor import PdfConverter
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+
+ class EmbeddingModel:
+     def __init__(self, model_path=None):
+         if model_path is None:
+             self.model = SentenceTransformer(
+                 "thenlper/gte-base",  # switch to en/zh for English or Chinese
+                 trust_remote_code=True
+             )
+             self.model.save(os.path.join(os.getcwd(), "embeddingModel"))
+         else:
+             self.model = SentenceTransformer(model_path)
+
+         self.model.max_seq_length = 512
+
+     def encode(self, texts):
+         return self.model.encode(texts)
+
+ class DocumentProcessor:
+     def __init__(self, model, chunk_size=1000, chunk_overlap=200):
+         self.model = model
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+
+     def context_chunks(self, document_text):
+         document = Document(page_content=document_text)
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=self.chunk_size,
+             chunk_overlap=self.chunk_overlap
+         )
+         text_chunks = text_splitter.split_documents([document])
+         text_content_chunks = [chunk.page_content for chunk in text_chunks]
+         return text_content_chunks
+
+     def context_embedding(self, text_content_chunks):
+         return [self.model.encode([text]) for text in text_content_chunks]
+
+     def rag_query(self, query):
+         return self.model.encode([query])
+
+     def similarity(self, query_embedding, text_contents_embeddings, text_content_chunks, top_k):
+         similarities = [
+             (text, cos_sim(embedding, query_embedding[0]))
+             for text, embedding in zip(text_content_chunks, text_contents_embeddings)
+         ]
+
+         similarities_sorted = sorted(similarities, key=lambda x: x[1], reverse=True)
+         top_k_texts = [text for text, _ in similarities_sorted[:top_k]]
+
+         return top_k_texts
+
+
+ # Example usage:
+ if __name__ == "__main__":
+     model = EmbeddingModel(model_path=os.path.join(os.getcwd(), "embeddingModel"))
+     processor = DocumentProcessor(model=model)
+
+     pdf_file = os.path.join(os.getcwd(), "pdfs", "test2.pdf")
+     converter = PdfConverter(pdf_file)
+     document_text = converter.convert_to_markdown()
+     text_chunks = processor.context_chunks(document_text)
+     text_embeddings = processor.context_embedding(text_chunks)
+
+     query = "what metric used in this paper for performance evaluation?"
+     query_embedding = processor.rag_query(query)
+     top_results = processor.similarity(query_embedding, text_embeddings, text_chunks, top_k=5)
+
+     print(top_results)
pdfs/.file ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ pymupdf4llm==0.0.17
+ groq==0.11.0
+ chromadb==0.5.11
+ tiktoken==0.8.0
+ langchain==0.3.2
+ langchain-community==0.3.1
+ langsmith==0.1.132
+ sentence-transformers==3.1.1
+ numpy
+ fastapi
+ uvicorn
+ python-multipart==0.0.12
+ python-dotenv==1.0.1
upload_log.json ADDED
@@ -0,0 +1 @@
+ {"32fe6152-eb2b-4805-838c-6227bca07d94": "2024-11-01T11:27:09.540866", "d00bcf07-de71-46c4-b083-cb1baa6060e6": "2024-11-01T11:30:55.332869", "12c64717-c9f9-4f7d-9493-510c138844c3": "2024-11-01T11:34:40.360413", "ea73c9e8-f113-4a3c-8431-21ad6a7fcc9c": "2024-11-01T12:42:11.897498"}
upload_to_space.py ADDED
@@ -0,0 +1,23 @@
+ from huggingface_hub import HfApi
+ import os
+
+ api = HfApi()
+ space_id = "Kurian07/Ultimate_llm_rag"
+ folder_path = "C:/Users/kuria/OneDrive/Desktop/llm/chatPDF-RAG/embeddingModel"
+
+ for root, _, files in os.walk(folder_path):
+     for file in files:
+         file_path = os.path.join(root, file)
+         relative_path = os.path.relpath(file_path, folder_path)  # Path inside the repo
+
+         try:
+             # Attempt to upload each file
+             api.upload_file(
+                 path_or_fileobj=file_path,
+                 path_in_repo=relative_path,
+                 repo_id=space_id,
+                 repo_type="space"
+             )
+             print(f"Uploaded {relative_path} to Hugging Face Space.")
+         except Exception as e:
+             print(f"Failed to upload {relative_path}: {e}")
vectorDB/.file ADDED
@@ -0,0 +1 @@
+ chatPDF by Bipin Saha