guneetsk99
/

finance_qwen_VL_7B

Image-Text-to-Text

text-generation-inference

Inference Endpoints

Model card · Files and versions · Community

guneetsk99 committed on Dec 18, 2024

Commit

3d6b1e3

·

verified ·

1 Parent(s): 7a6263a

Update app.py

Files changed (1) hide show

app.py +16 -11

app.py CHANGED Viewed

@@ -1,33 +1,38 @@
 import gradio as gr
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from PIL import Image
-# Load model and processor
 processor = AutoProcessor.from_pretrained("guneetsk99/finance_qwen_VL_7B")
 model = AutoModelForImageTextToText.from_pretrained("guneetsk99/finance_qwen_VL_7B")
-def predict(input_img):
-    # Preprocess the image
-    inputs = processor(images=input_img, return_tensors="pt")
     # Generate predictions using the model
-    outputs = model.generate(**inputs)
     # Decode the generated text
     generated_text = processor.decode(outputs[0], skip_special_tokens=True)
-    # Return the input image and the generated text
-    return input_img, {"Prediction": generated_text}
 # Create the Gradio interface
 gradio_app = gr.Interface(
-    predict,
-    inputs=gr.Image(label="Upload Image", source="upload", type="pil"),
     outputs=[
         gr.Image(label="Uploaded Image"),
-        gr.Label(label="Generated Text"),
     ],
-    title="Image to Text Model",
 )
 if __name__ == "__main__":

 import gradio as gr
 from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch
 from PIL import Image
+# Load the processor and model
 processor = AutoProcessor.from_pretrained("guneetsk99/finance_qwen_VL_7B")
 model = AutoModelForImageTextToText.from_pretrained("guneetsk99/finance_qwen_VL_7B")
+def predict(input_img, text_prompt):
+    # Preprocess the image and text prompt
+    inputs = processor(images=input_img, text=text_prompt, return_tensors="pt").to(model.device)
     # Generate predictions using the model
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=50)
     # Decode the generated text
     generated_text = processor.decode(outputs[0], skip_special_tokens=True)
+    return input_img, generated_text
 # Create the Gradio interface
 gradio_app = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Image(label="Upload Image", source="upload", type="pil"),
+        gr.Textbox(label="Text Prompt", placeholder="Enter a text prompt, e.g., 'Describe this image.'"),
+    ],
     outputs=[
         gr.Image(label="Uploaded Image"),
+        gr.Textbox(label="Generated Response"),
     ],
+    title="Finance Image-to-Text Model",
+    description="Upload a financial document image and provide a text prompt for the model to process the image and generate a text response.",
 )
 if __name__ == "__main__":