arad1367 committed on
Commit cb6b0bf
1 Parent(s): ec95781

Update app.py

Files changed (1)
  1. app.py +85 -45
app.py CHANGED
@@ -9,23 +9,18 @@ import torch
 import torchvision
 import subprocess
 
-# Run the commands from setup.sh to install poppler-utils
 def install_poppler():
     try:
         subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     except FileNotFoundError:
         print("Poppler not found. Installing...")
-        # Run the setup commands
         subprocess.run("apt-get update", shell=True)
         subprocess.run("apt-get install -y poppler-utils", shell=True)
 
-# Call the Poppler installation check
install_poppler()
 
-# Install flash-attn if not already installed
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-# Load the RAG Model and the Qwen2-VL-2B-Instruct model
 RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
 model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
                                                         trust_remote_code=True, torch_dtype=torch.bfloat16).cuda().eval()
@@ -33,24 +28,20 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_rem
 
 @spaces.GPU()
 def process_pdf_and_query(pdf_file, user_query):
-    # Convert the PDF to images
-    images = convert_from_path(pdf_file.name) # pdf_file.name gives the file path
+    images = convert_from_path(pdf_file.name)
     num_images = len(images)
 
-    # Indexing the PDF in RAG
     RAG.index(
         input_path=pdf_file.name,
-        index_name="image_index", # index will be saved at index_root/index_name/
+        index_name="image_index",
         store_collection_with_index=False,
         overwrite=True
     )
 
-    # Search the query in the RAG model
     results = RAG.search(user_query, k=1)
     if not results:
         return "No results found.", num_images
 
-    # Retrieve the page number and process image
     image_index = results[0]["page_num"] - 1
     messages = [
         {
@@ -65,7 +56,6 @@ def process_pdf_and_query(pdf_file, user_query):
         }
     ]
 
-    # Generate text with the Qwen model
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
@@ -76,8 +66,7 @@ def process_pdf_and_query(pdf_file, user_query):
         return_tensors="pt",
     )
     inputs = inputs.to("cuda")
-
-    # Generate the output response
+
     generated_ids = model.generate(**inputs, max_new_tokens=50)
     generated_ids_trimmed = [
         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -88,36 +77,87 @@ def process_pdf_and_query(pdf_file, user_query):
 
     return output_text[0], num_images
 
-
-with gr.Blocks(theme='freddyaboulton/dracula_revamped') as demo:
-    gr.HTML("<h1 style='text-align: center; font-size: 30px;'><a href='https://github.com/arad1367'>Multimodal RAG with Image Query - By Pejman Ebrahimi</a></h1>")
-    gr.Markdown("Multimodal RAG is a technique that combines both textual and visual data to provide more accurate and comprehensive results. In this application, we use ColPali, a multimodal retriever, and Byaldi, a new library by answer.ai to easily use ColPali. We also use Qwen/Qwen2-VL-2B-Instruct LLM.")
-
-    pdf_input = gr.File(label="Upload PDF")
-    query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
-    output_text = gr.Textbox(label="Model Answer")
-    output_images = gr.Textbox(label="Number of Images in PDF")
-
-    submit_btn = gr.Button("Submit", variant="primary")
-    submit_btn.style(full_width=True)
-
-    duplicate_btn = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
-    duplicate_btn.style(full_width=True)
-
-    submit_btn.click(fn=process_pdf_and_query, inputs=[pdf_input, query_input], outputs=[output_text, output_images])
-
-    footer = """
-    <div style="text-align: center; margin-top: 20px;">
-        <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
-        <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
-        <a href="https://arad1367.pythonanywhere.com/" target="_blank">Live demo of my PhD defense</a> |
-        <a href="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct" target="_blank">Qwen/Qwen2-VL-2B-Instruct</a> |
-        <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
-        <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
-        <br>
-        Made with 💖 by Pejman Ebrahimi
-    </div>
-    """
+css = """
+body {
+    font-family: Arial, sans-serif;
+    background-color: #f0f0f0;
+}
+.container {
+    max-width: 800px;
+    margin: 0 auto;
+    padding: 20px;
+    background-color: white;
+    border-radius: 10px;
+    box-shadow: 0 0 10px rgba(0,0,0,0.1);
+}
+.title {
+    font-size: 24px;
+    font-weight: bold;
+    text-align: center;
+    margin-bottom: 20px;
+}
+.submit-btn {
+    background-color: #4CAF50;
+    color: white;
+    padding: 10px 20px;
+    border: none;
+    border-radius: 5px;
+    cursor: pointer;
+    font-size: 16px;
+}
+.submit-btn:hover {
+    background-color: #45a049;
+}
+.duplicate-button {
+    background-color: #4CAF50;
+    color: white;
+    padding: 10px 20px;
+    border: none;
+    border-radius: 5px;
+    cursor: pointer;
+    font-size: 16px;
+    margin-top: 20px;
+}
+"""
+
+explanation = """
+<div style="background-color: #f9f9f9; padding: 15px; border-radius: 5px; margin-bottom: 20px;">
+    <h3>About Multimodal RAG</h3>
+    <p>Multimodal RAG (Retrieval-Augmented Generation) combines text and image processing to provide more context-aware responses. This demo uses:</p>
+    <ul>
+        <li><strong>ColPali</strong>: A multimodal retriever for efficient information retrieval from images and text.</li>
+        <li><strong>Byaldi</strong>: A new library by answer.ai that simplifies the use of ColPali.</li>
+        <li><strong>Qwen/Qwen2-VL-2B-Instruct</strong>: A large language model capable of processing both text and visual inputs.</li>
+    </ul>
+    <p>This combination allows for more accurate and context-aware responses to queries about uploaded PDFs.</p>
+</div>
+"""
+
+footer = """
+<div style="text-align: center; margin-top: 20px;">
+    <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
+    <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
+    <a href="https://arad1367.pythonanywhere.com/" target="_blank">Live demo of my PhD defense</a> |
+    <a href="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct" target="_blank">Qwen/Qwen2-VL-2B-Instruct</a> |
+    <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
+    <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
+    <br>
+    Made with 💖 by Pejman Ebrahimi
+</div>
+"""
+
+with gr.Blocks(css=css, theme='freddyaboulton/dracula_revamped') as demo:
+    gr.HTML('<h1 style="text-align: center; font-size: 32px;"><a href="https://github.com/arad1367" target="_blank" style="text-decoration: none; color: inherit;">Multimodal RAG with Image Query - By Pejman Ebrahimi</a></h1>')
+    gr.HTML(explanation)
+    pdf_input = gr.File(label="Upload PDF")
+    query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
+    submit_btn = gr.Button("Submit", elem_classes="submit-btn")
+    output_text = gr.Textbox(label="Model Answer")
+    output_images = gr.Textbox(label="Number of Images in PDF")
+
+    submit_btn.click(process_pdf_and_query, inputs=[pdf_input, query_input], outputs=[output_text, output_images])
+
+    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
     gr.HTML(footer)
 
-demo.launch(debug=True)
+demo.launch(debug=True)
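
The retrieval path is easier to follow in isolation. Below is a minimal sketch using only the Byaldi calls that are visible in the diff (RAGMultiModalModel.from_pretrained, RAG.index, RAG.search); the PDF path and the query string are placeholders, not values from app.py, and byaldi plus poppler are assumed to be installed.

    from byaldi import RAGMultiModalModel

    RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")

    # Index every page of the PDF. With store_collection_with_index=False only
    # the embeddings are stored, so page images must be re-created separately
    # (app.py does this with pdf2image's convert_from_path).
    RAG.index(
        input_path="sample.pdf",           # placeholder path
        index_name="image_index",          # written under index_root/index_name/
        store_collection_with_index=False,
        overwrite=True,
    )

    # Retrieve the single best-matching page for a text query.
    results = RAG.search("What does the revenue chart show?", k=1)
    if results:
        print("Best match is page", results[0]["page_num"])  # 1-based page number

Because page_num is 1-based, app.py subtracts 1 to index into the list returned by convert_from_path.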
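The generation path depends on lines the diff view elides (the message body at old lines 57-64 and the middle of the processor call), so the sketch below fills those gaps with the standard Qwen2-VL chat format from the model card rather than app.py's literal code. It assumes a CUDA GPU, poppler, and the qwen-vl-utils package, as app.py does; the PDF path and query are again placeholders.

    import torch
    from pdf2image import convert_from_path
    from qwen_vl_utils import process_vision_info
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16
    ).cuda().eval()
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)

    page_image = convert_from_path("sample.pdf")[0]  # PIL image of the retrieved page
    user_query = "Summarize this page."

    # Standard Qwen2-VL message format (assumption: app.py's elided lines match it).
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": page_image},
                {"type": "text", "text": user_query},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=50)
    # Drop the prompt tokens so only the newly generated answer is decoded.
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])

With max_new_tokens=50, answers stay deliberately short; raising the limit trades latency for longer responses.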