arad1367 committed on
Commit
f9c5a74
•
1 Parent(s): 7b52b89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -94
app.py CHANGED
@@ -1,94 +1,90 @@
1
- import spaces
2
- import gradio as gr
3
- from pdf2image import convert_from_path
4
- from byaldi import RAGMultiModalModel
5
- from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
6
- from qwen_vl_utils import process_vision_info
7
- import torch
8
- import subprocess
9
-
10
- # Install flash-attn if not already installed
11
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
12
-
13
- # try:
14
- # subprocess.check_output(['dpkg', '-s', 'poppler-utils'])
15
- # except subprocess.CalledProcessError:
16
- # print("Error: poppler-utils is not installed. Installing...")
17
- # subprocess.check_call(['sudo', 'apt-get', 'install', '-y', 'poppler-utils'])
18
-
19
- # Load the RAG Model and the Qwen2-VL-2B-Instruct model
20
- RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
21
- model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
22
- trust_remote_code=True, torch_dtype=torch.bfloat16).cuda().eval()
23
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
24
-
25
- @spaces.GPU()
26
- def process_pdf_and_query(pdf_file, user_query):
27
- # Convert the PDF to images
28
- images = convert_from_path(pdf_file.name) # pdf_file.name gives the file path
29
- num_images = len(images)
30
-
31
- # Indexing the PDF in RAG
32
- RAG.index(
33
- input_path=pdf_file.name,
34
- index_name="image_index", # index will be saved at index_root/index_name/
35
- store_collection_with_index=False,
36
- overwrite=True
37
- )
38
-
39
- # Search the query in the RAG model
40
- results = RAG.search(user_query, k=1)
41
- if not results:
42
- return "No results found.", num_images
43
-
44
- # Retrieve the page number and process image
45
- image_index = results[0]["page_num"] - 1
46
- messages = [
47
- {
48
- "role": "user",
49
- "content": [
50
- {
51
- "type": "image",
52
- "image": images[image_index],
53
- },
54
- {"type": "text", "text": user_query},
55
- ],
56
- }
57
- ]
58
-
59
- # Generate text with the Qwen model
60
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
61
- image_inputs, video_inputs = process_vision_info(messages)
62
- inputs = processor(
63
- text=[text],
64
- images=image_inputs,
65
- videos=video_inputs,
66
- padding=True,
67
- return_tensors="pt",
68
- )
69
- inputs = inputs.to("cuda")
70
-
71
- # Generate the output response
72
- generated_ids = model.generate(**inputs, max_new_tokens=50)
73
- generated_ids_trimmed = [
74
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
75
- ]
76
- output_text = processor.batch_decode(
77
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
78
- )
79
-
80
- return output_text[0], num_images
81
-
82
- # Define the Gradio Interface
83
- pdf_input = gr.inputs.File(label="Upload PDF", type="file")
84
- query_input = gr.inputs.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
85
- output_text = gr.outputs.Textbox(label="Model Answer")
86
- output_images = gr.outputs.Textbox(label="Number of Images in PDF")
87
-
88
- # Launch the Gradio app
89
- gr.Interface(
90
- fn=process_pdf_and_query,
91
- inputs=[pdf_input, query_input],
92
- outputs=[output_text, output_images],
93
- title="Multimodal RAG with Image Query - By Pejman Ebrahimi"
94
- ).launch()
 
1
+ import spaces
2
+ import gradio as gr
3
+ from pdf2image import convert_from_path
4
+ from byaldi import RAGMultiModalModel
5
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
6
+ from qwen_vl_utils import process_vision_info
7
+ import torch
8
+ import subprocess
9
+
10
+ # Install flash-attn if not already installed
11
+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
12
+
13
+ # Load the RAG Model and the Qwen2-VL-2B-Instruct model
14
+ RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
15
+ model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
16
+ trust_remote_code=True, torch_dtype=torch.bfloat16).cuda().eval()
17
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
18
+
19
+ @spaces.GPU()
20
+ def process_pdf_and_query(pdf_file, user_query):
21
+ # Convert the PDF to images
22
+ images = convert_from_path(pdf_file.name) # pdf_file.name gives the file path
23
+ num_images = len(images)
24
+
25
+ # Indexing the PDF in RAG
26
+ RAG.index(
27
+ input_path=pdf_file.name,
28
+ index_name="image_index", # index will be saved at index_root/index_name/
29
+ store_collection_with_index=False,
30
+ overwrite=True
31
+ )
32
+
33
+ # Search the query in the RAG model
34
+ results = RAG.search(user_query, k=1)
35
+ if not results:
36
+ return "No results found.", num_images
37
+
38
+ # Retrieve the page number and process image
39
+ image_index = results[0]["page_num"] - 1
40
+ messages = [
41
+ {
42
+ "role": "user",
43
+ "content": [
44
+ {
45
+ "type": "image",
46
+ "image": images[image_index],
47
+ },
48
+ {"type": "text", "text": user_query},
49
+ ],
50
+ }
51
+ ]
52
+
53
+ # Generate text with the Qwen model
54
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
55
+ image_inputs, video_inputs = process_vision_info(messages)
56
+ inputs = processor(
57
+ text=[text],
58
+ images=image_inputs,
59
+ videos=video_inputs,
60
+ padding=True,
61
+ return_tensors="pt",
62
+ )
63
+ inputs = inputs.to("cuda")
64
+
65
+ # Generate the output response
66
+ generated_ids = model.generate(**inputs, max_new_tokens=50)
67
+ generated_ids_trimmed = [
68
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
69
+ ]
70
+ output_text = processor.batch_decode(
71
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
72
+ )
73
+
74
+ return output_text[0], num_images
75
+
76
+ # Define the Gradio Interface
77
+ pdf_input = gr.File(label="Upload PDF") # Single PDF file input
78
+ query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF") # User query input
79
+ output_text = gr.Textbox(label="Model Answer") # Output for the model's answer
80
+ output_images = gr.Textbox(label="Number of Images in PDF") # Output for number of images
81
+
82
+ # Launch the Gradio app
83
+ demo = gr.Interface(
84
+ fn=process_pdf_and_query,
85
+ inputs=[pdf_input, query_input], # List of inputs
86
+ outputs=[output_text, output_images], # List of outputs
87
+ title="Multimodal RAG with Image Query - By Pejman Ebrahimi"
88
+ )
89
+
90
+ demo.launch(debug=True) # Start the interface