AdithyaSK committed on
Commit
0bb07b7
1 Parent(s): ad6ac6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -73
app.py CHANGED
@@ -67,87 +67,182 @@ hybrid_rag = HybridColpaliRAG(
67
  IngestResult = namedtuple("IngestResult", ["status_text", "progress_table"])
68
 
69
 
70
- @spaces.GPU(duration=120)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def ingest_data(pdf_files, use_ocr, chunk_size, progress=gr.Progress()):
72
  file_paths = [pdf_file.name for pdf_file in pdf_files]
73
  total_start_time = time.time()
74
  progress_data = []
75
 
76
- # SimpleRAG
77
- yield IngestResult(
78
- status_text="Starting SimpleRAG ingestion...\n",
79
- progress_table=pd.DataFrame(progress_data),
80
- )
81
- start_time = time.time()
82
- simple_rag.index(
83
- file_paths,
84
- recursive=False,
85
- chunking_strategy=FixedTokenChunker(chunk_size=chunk_size),
86
- metadata={"source": "gradio_upload"},
87
- overwrite=True,
88
- verbose=True,
89
- ocr=use_ocr,
90
- )
91
- simple_time = time.time() - start_time
92
- progress_data.append(
93
- {"Technique": "SimpleRAG", "Time Taken (s)": f"{simple_time:.2f}"}
94
- )
95
- yield IngestResult(
96
- status_text=f"SimpleRAG ingestion complete. Time taken: {simple_time:.2f} seconds\n\n",
97
- progress_table=pd.DataFrame(progress_data),
98
- )
99
- # progress(0.25, desc="SimpleRAG complete")
100
 
101
- # VisionRAG
102
- yield IngestResult(
103
- status_text="Starting VisionRAG ingestion...\n",
104
- progress_table=pd.DataFrame(progress_data),
105
- )
106
- start_time = time.time()
107
- vision_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
108
- vision_time = time.time() - start_time
109
- progress_data.append(
110
- {"Technique": "VisionRAG", "Time Taken (s)": f"{vision_time:.2f}"}
111
- )
112
- yield IngestResult(
113
- status_text=f"VisionRAG ingestion complete. Time taken: {vision_time:.2f} seconds\n\n",
114
- progress_table=pd.DataFrame(progress_data),
115
- )
116
- # progress(0.5, desc="VisionRAG complete")
117
 
118
- # ColpaliRAG
119
- yield IngestResult(
120
- status_text="Starting ColpaliRAG ingestion...\n",
121
- progress_table=pd.DataFrame(progress_data),
122
- )
123
- start_time = time.time()
124
- colpali_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
125
- colpali_time = time.time() - start_time
126
- progress_data.append(
127
- {"Technique": "ColpaliRAG", "Time Taken (s)": f"{colpali_time:.2f}"}
128
- )
129
- yield IngestResult(
130
- status_text=f"ColpaliRAG ingestion complete. Time taken: {colpali_time:.2f} seconds\n\n",
131
- progress_table=pd.DataFrame(progress_data),
132
- )
133
- # progress(0.75, desc="ColpaliRAG complete")
134
 
135
- # HybridColpaliRAG
136
- yield IngestResult(
137
- status_text="Starting HybridColpaliRAG ingestion...\n",
138
- progress_table=pd.DataFrame(progress_data),
139
- )
140
- start_time = time.time()
141
- hybrid_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
142
- hybrid_time = time.time() - start_time
143
- progress_data.append(
144
- {"Technique": "HybridColpaliRAG", "Time Taken (s)": f"{hybrid_time:.2f}"}
145
- )
146
- yield IngestResult(
147
- status_text=f"HybridColpaliRAG ingestion complete. Time taken: {hybrid_time:.2f} seconds\n\n",
148
- progress_table=pd.DataFrame(progress_data),
149
- )
150
- # progress(1.0, desc="HybridColpaliRAG complete")
 
 
 
 
 
 
151
 
152
  total_time = time.time() - total_start_time
153
  progress_data.append({"Technique": "Total", "Time Taken (s)": f"{total_time:.2f}"})
@@ -313,6 +408,18 @@ Built on [VARAG](https://github.com/adithya-s-k/VARAG) - Vision-Augmented Retrie
313
  )
314
 
315
  with gr.Tab("Ingest Data"):
 
 
 
 
 
 
 
 
 
 
 
 
316
  pdf_input = gr.File(
317
  label="Upload PDF(s)", file_count="multiple", file_types=["pdf"]
318
  )
 
67
  IngestResult = namedtuple("IngestResult", ["status_text", "progress_table"])
68
 
69
 
70
+ # @spaces.GPU(duration=120)
71
+ # def ingest_data(pdf_files, use_ocr, chunk_size, progress=gr.Progress()):
72
+ # file_paths = [pdf_file.name for pdf_file in pdf_files]
73
+ # total_start_time = time.time()
74
+ # progress_data = []
75
+
76
+ # # SimpleRAG
77
+ # yield IngestResult(
78
+ # status_text="Starting SimpleRAG ingestion...\n",
79
+ # progress_table=pd.DataFrame(progress_data),
80
+ # )
81
+ # start_time = time.time()
82
+ # simple_rag.index(
83
+ # file_paths,
84
+ # recursive=False,
85
+ # chunking_strategy=FixedTokenChunker(chunk_size=chunk_size),
86
+ # metadata={"source": "gradio_upload"},
87
+ # overwrite=True,
88
+ # verbose=True,
89
+ # ocr=use_ocr,
90
+ # )
91
+ # simple_time = time.time() - start_time
92
+ # progress_data.append(
93
+ # {"Technique": "SimpleRAG", "Time Taken (s)": f"{simple_time:.2f}"}
94
+ # )
95
+ # yield IngestResult(
96
+ # status_text=f"SimpleRAG ingestion complete. Time taken: {simple_time:.2f} seconds\n\n",
97
+ # progress_table=pd.DataFrame(progress_data),
98
+ # )
99
+ # # progress(0.25, desc="SimpleRAG complete")
100
+
101
+ # # VisionRAG
102
+ # yield IngestResult(
103
+ # status_text="Starting VisionRAG ingestion...\n",
104
+ # progress_table=pd.DataFrame(progress_data),
105
+ # )
106
+ # start_time = time.time()
107
+ # vision_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
108
+ # vision_time = time.time() - start_time
109
+ # progress_data.append(
110
+ # {"Technique": "VisionRAG", "Time Taken (s)": f"{vision_time:.2f}"}
111
+ # )
112
+ # yield IngestResult(
113
+ # status_text=f"VisionRAG ingestion complete. Time taken: {vision_time:.2f} seconds\n\n",
114
+ # progress_table=pd.DataFrame(progress_data),
115
+ # )
116
+ # # progress(0.5, desc="VisionRAG complete")
117
+
118
+ # # ColpaliRAG
119
+ # yield IngestResult(
120
+ # status_text="Starting ColpaliRAG ingestion...\n",
121
+ # progress_table=pd.DataFrame(progress_data),
122
+ # )
123
+ # start_time = time.time()
124
+ # colpali_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
125
+ # colpali_time = time.time() - start_time
126
+ # progress_data.append(
127
+ # {"Technique": "ColpaliRAG", "Time Taken (s)": f"{colpali_time:.2f}"}
128
+ # )
129
+ # yield IngestResult(
130
+ # status_text=f"ColpaliRAG ingestion complete. Time taken: {colpali_time:.2f} seconds\n\n",
131
+ # progress_table=pd.DataFrame(progress_data),
132
+ # )
133
+ # # progress(0.75, desc="ColpaliRAG complete")
134
+
135
+ # # HybridColpaliRAG
136
+ # yield IngestResult(
137
+ # status_text="Starting HybridColpaliRAG ingestion...\n",
138
+ # progress_table=pd.DataFrame(progress_data),
139
+ # )
140
+ # start_time = time.time()
141
+ # hybrid_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
142
+ # hybrid_time = time.time() - start_time
143
+ # progress_data.append(
144
+ # {"Technique": "HybridColpaliRAG", "Time Taken (s)": f"{hybrid_time:.2f}"}
145
+ # )
146
+ # yield IngestResult(
147
+ # status_text=f"HybridColpaliRAG ingestion complete. Time taken: {hybrid_time:.2f} seconds\n\n",
148
+ # progress_table=pd.DataFrame(progress_data),
149
+ # )
150
+ # # progress(1.0, desc="HybridColpaliRAG complete")
151
+
152
+ # total_time = time.time() - total_start_time
153
+ # progress_data.append({"Technique": "Total", "Time Taken (s)": f"{total_time:.2f}"})
154
+ # yield IngestResult(
155
+ # status_text=f"Total ingestion time: {total_time:.2f} seconds",
156
+ # progress_table=pd.DataFrame(progress_data),
157
+ # )
158
+
159
+
160
  def ingest_data(pdf_files, use_ocr, chunk_size, progress=gr.Progress()):
161
  file_paths = [pdf_file.name for pdf_file in pdf_files]
162
  total_start_time = time.time()
163
  progress_data = []
164
 
165
+ @spaces.GPU(duration=120)
166
+ def ingest_simple_rag():
167
+ yield IngestResult(
168
+ status_text="Starting SimpleRAG ingestion...\n",
169
+ progress_table=pd.DataFrame(progress_data),
170
+ )
171
+ start_time = time.time()
172
+ simple_rag.index(
173
+ file_paths,
174
+ recursive=False,
175
+ chunking_strategy=FixedTokenChunker(chunk_size=chunk_size),
176
+ metadata={"source": "gradio_upload"},
177
+ overwrite=True,
178
+ verbose=True,
179
+ ocr=use_ocr,
180
+ )
181
+ simple_time = time.time() - start_time
182
+ progress_data.append(
183
+ {"Technique": "SimpleRAG", "Time Taken (s)": f"{simple_time:.2f}"}
184
+ )
185
+ yield IngestResult(
186
+ status_text=f"SimpleRAG ingestion complete. Time taken: {simple_time:.2f} seconds\n\n",
187
+ progress_table=pd.DataFrame(progress_data),
188
+ )
189
 
190
+ @spaces.GPU(duration=120)
191
+ def ingest_vision_rag():
192
+ yield IngestResult(
193
+ status_text="Starting VisionRAG ingestion...\n",
194
+ progress_table=pd.DataFrame(progress_data),
195
+ )
196
+ start_time = time.time()
197
+ vision_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
198
+ vision_time = time.time() - start_time
199
+ progress_data.append(
200
+ {"Technique": "VisionRAG", "Time Taken (s)": f"{vision_time:.2f}"}
201
+ )
202
+ yield IngestResult(
203
+ status_text=f"VisionRAG ingestion complete. Time taken: {vision_time:.2f} seconds\n\n",
204
+ progress_table=pd.DataFrame(progress_data),
205
+ )
206
 
207
+ @spaces.GPU(duration=120)
208
+ def ingest_colpali_rag():
209
+ yield IngestResult(
210
+ status_text="Starting ColpaliRAG ingestion...\n",
211
+ progress_table=pd.DataFrame(progress_data),
212
+ )
213
+ start_time = time.time()
214
+ colpali_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
215
+ colpali_time = time.time() - start_time
216
+ progress_data.append(
217
+ {"Technique": "ColpaliRAG", "Time Taken (s)": f"{colpali_time:.2f}"}
218
+ )
219
+ yield IngestResult(
220
+ status_text=f"ColpaliRAG ingestion complete. Time taken: {colpali_time:.2f} seconds\n\n",
221
+ progress_table=pd.DataFrame(progress_data),
222
+ )
223
 
224
+ @spaces.GPU(duration=120)
225
+ def ingest_hybrid_rag():
226
+ yield IngestResult(
227
+ status_text="Starting HybridColpaliRAG ingestion...\n",
228
+ progress_table=pd.DataFrame(progress_data),
229
+ )
230
+ start_time = time.time()
231
+ hybrid_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
232
+ hybrid_time = time.time() - start_time
233
+ progress_data.append(
234
+ {"Technique": "HybridColpaliRAG", "Time Taken (s)": f"{hybrid_time:.2f}"}
235
+ )
236
+ yield IngestResult(
237
+ status_text=f"HybridColpaliRAG ingestion complete. Time taken: {hybrid_time:.2f} seconds\n\n",
238
+ progress_table=pd.DataFrame(progress_data),
239
+ )
240
+
241
+ # Call each ingestion function
242
+ yield from ingest_simple_rag()
243
+ yield from ingest_vision_rag()
244
+ yield from ingest_colpali_rag()
245
+ yield from ingest_hybrid_rag()
246
 
247
  total_time = time.time() - total_start_time
248
  progress_data.append({"Technique": "Total", "Time Taken (s)": f"{total_time:.2f}"})
 
408
  )
409
 
410
  with gr.Tab("Ingest Data"):
411
+ gr.Markdown(
412
+ """
413
+ ## ⚠️ Important Note on Data Ingestion
414
+
415
+ This Space has a maximum GPU-enabled time of 120 seconds. It's recommended to try ingesting only 1 or 2 pdfs at a time.
416
+
417
+ If you want to ingest a larger amount of data, please try it out in a Google Colab notebook:
418
+
419
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/adithya-s-k/VARAG/blob/main/docs/demo.ipynb)
420
+
421
+ """
422
+ )
423
  pdf_input = gr.File(
424
  label="Upload PDF(s)", file_count="multiple", file_types=["pdf"]
425
  )