Daemontatox committed on
Commit
d65975a
·
verified ·
1 Parent(s): c19ad99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -24
app.py CHANGED
@@ -51,14 +51,14 @@ def process_pdf_file(file_path):
51
  page_text = page.get_text("text")
52
  if page_text.strip():
53
  text += f"Page {page_num + 1}:\n{page_text}\n\n"
54
-
55
  # Render page as an image with a zoom factor
56
  zoom = 3
57
  mat = fitz.Matrix(zoom, zoom)
58
  pix = page.get_pixmap(matrix=mat, alpha=False)
59
  img_data = pix.tobytes("png")
60
  img = Image.open(io.BytesIO(img_data)).convert("RGB")
61
-
62
  # Resize if image is too large
63
  max_size = 1600
64
  if max(img.size) > max_size:
@@ -83,7 +83,7 @@ def process_uploaded_file(file):
83
  doc_state.clear()
84
  if file is None:
85
  return "No file uploaded. Please upload a file."
86
-
87
  # Get the file path from the Gradio upload (may be a dict or file-like object)
88
  if isinstance(file, dict):
89
  file_path = file["name"]
@@ -91,7 +91,7 @@ def process_uploaded_file(file):
91
  file_path = file.name
92
  file_ext = file_path.lower().split('.')[-1]
93
  image_extensions = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'}
94
-
95
  if file_ext == 'pdf':
96
  doc_state.doc_type = 'pdf'
97
  try:
@@ -121,7 +121,7 @@ def process_uploaded_file(file):
121
  # -------------------------------
122
  # Bot Streaming Function Using the Multimodal API
123
  # -------------------------------
124
- def bot_streaming(prompt_option, user_message, max_new_tokens=8192):
125
  """
126
  Build a multimodal message payload and call the inference API.
127
  The payload includes:
@@ -576,15 +576,15 @@ This comprehensive system prompt provides a strong foundation for building a pow
576
  """
577
  )
578
  }
579
-
580
  # Select the appropriate prompt
581
  selected_prompt = prompts.get(prompt_option, "Invalid prompt selected.")
582
  full_prompt = selected_prompt
583
-
584
  # Append the user-provided message, if any
585
  if user_message and user_message.strip():
586
  full_prompt += "\nUser Message:\n" + user_message
587
-
588
  # Append document context if available
589
  if doc_state.current_doc_images and doc_state.current_doc_text:
590
  full_prompt += "\nDocument context:\n" + doc_state.current_doc_text
@@ -602,7 +602,7 @@ This comprehensive system prompt provides a strong foundation for building a pow
602
  ]
603
  }
604
  ]
605
-
606
  # If an image is available, encode it as a data URI and append it as an image_url message.
607
  if doc_state.current_doc_images:
608
  buffered = io.BytesIO()
@@ -614,22 +614,23 @@ This comprehensive system prompt provides a strong foundation for building a pow
614
  "type": "image_url",
615
  "image_url": {"url": data_uri}
616
  })
617
-
618
  # Call the inference API with streaming enabled.
619
  stream = client.chat.completions.create(
620
- model="google/gemini-2.0-pro-exp-02-05:free",
621
  messages=messages,
622
  max_tokens=max_new_tokens,
623
  stream=True
624
  )
625
-
626
  buffer = ""
627
  for chunk in stream:
628
  # The response structure is similar to the reference: each chunk contains a delta.
629
  delta = chunk.choices[0].delta.content
630
- buffer += delta
631
- time.sleep(0.01)
632
- yield buffer
 
633
 
634
  except Exception as e:
635
  logger.error(f"Error in bot_streaming: {str(e)}")
@@ -644,8 +645,8 @@ def clear_context():
644
  # Create the Gradio Interface
645
  # -------------------------------
646
  with gr.Blocks() as demo:
647
- gr.Markdown("# Document Analyzer with Predetermined Prompts")
648
- gr.Markdown("Upload a PDF or image (PNG, JPG, JPEG, GIF, BMP, WEBP) and select a prompt to analyze its contents.")
649
 
650
  with gr.Row():
651
  file_upload = gr.File(
@@ -655,12 +656,25 @@ with gr.Blocks() as demo:
655
  upload_status = gr.Textbox(label="Upload Status", interactive=True)
656
 
657
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
658
  prompt_dropdown = gr.Dropdown(
659
  label="Select Prompt",
660
  choices=["Default","Structured Software Tester","UserStoryCraft","APIDoc","DBModel","RiskAssess","CodeComment","RequirementCraft","DesignDoc","DiagramGen","TechWrite","UIUXReview","AccessibilityCheck","RiskAssess"],
661
  value="Default"
662
  )
663
-
664
  # Additional textbox for user messages
665
  with gr.Row():
666
  user_message_input = gr.Textbox(
@@ -668,16 +682,16 @@ with gr.Blocks() as demo:
668
  placeholder="Enter any additional instructions or context here (optional)",
669
  lines=4
670
  )
671
-
672
  with gr.Row():
673
  generate_btn = gr.Button("Generate")
674
  clear_btn = gr.Button("Clear Document Context")
675
-
676
  output_text = gr.Textbox(label="Output", interactive=False, lines=15)
677
-
678
  file_upload.change(fn=process_uploaded_file, inputs=[file_upload], outputs=[upload_status])
679
- # Pass both the prompt and the additional user message to bot_streaming
680
- generate_btn.click(fn=bot_streaming, inputs=[prompt_dropdown, user_message_input], outputs=[output_text])
681
  clear_btn.click(fn=clear_context, outputs=[upload_status])
682
 
683
- demo.launch(debug=True)
 
51
  page_text = page.get_text("text")
52
  if page_text.strip():
53
  text += f"Page {page_num + 1}:\n{page_text}\n\n"
54
+
55
  # Render page as an image with a zoom factor
56
  zoom = 3
57
  mat = fitz.Matrix(zoom, zoom)
58
  pix = page.get_pixmap(matrix=mat, alpha=False)
59
  img_data = pix.tobytes("png")
60
  img = Image.open(io.BytesIO(img_data)).convert("RGB")
61
+
62
  # Resize if image is too large
63
  max_size = 1600
64
  if max(img.size) > max_size:
 
83
  doc_state.clear()
84
  if file is None:
85
  return "No file uploaded. Please upload a file."
86
+
87
  # Get the file path from the Gradio upload (may be a dict or file-like object)
88
  if isinstance(file, dict):
89
  file_path = file["name"]
 
91
  file_path = file.name
92
  file_ext = file_path.lower().split('.')[-1]
93
  image_extensions = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'}
94
+
95
  if file_ext == 'pdf':
96
  doc_state.doc_type = 'pdf'
97
  try:
 
121
  # -------------------------------
122
  # Bot Streaming Function Using the Multimodal API
123
  # -------------------------------
124
+ def bot_streaming(model_option, prompt_option, user_message, max_new_tokens=8192):
125
  """
126
  Build a multimodal message payload and call the inference API.
127
  The payload includes:
 
576
  """
577
  )
578
  }
579
+
580
  # Select the appropriate prompt
581
  selected_prompt = prompts.get(prompt_option, "Invalid prompt selected.")
582
  full_prompt = selected_prompt
583
+
584
  # Append the user-provided message, if any
585
  if user_message and user_message.strip():
586
  full_prompt += "\nUser Message:\n" + user_message
587
+
588
  # Append document context if available
589
  if doc_state.current_doc_images and doc_state.current_doc_text:
590
  full_prompt += "\nDocument context:\n" + doc_state.current_doc_text
 
602
  ]
603
  }
604
  ]
605
+
606
  # If an image is available, encode it as a data URI and append it as an image_url message.
607
  if doc_state.current_doc_images:
608
  buffered = io.BytesIO()
 
614
  "type": "image_url",
615
  "image_url": {"url": data_uri}
616
  })
617
+
618
  # Call the inference API with streaming enabled.
619
  stream = client.chat.completions.create(
620
+ model=model_option, # Use the selected model here
621
  messages=messages,
622
  max_tokens=max_new_tokens,
623
  stream=True
624
  )
625
+
626
  buffer = ""
627
  for chunk in stream:
628
  # The response structure is similar to the reference: each chunk contains a delta.
629
  delta = chunk.choices[0].delta.content
630
+ if delta is not None: # Check if delta is not None
631
+ buffer += delta
632
+ time.sleep(0.01)
633
+ yield buffer
634
 
635
  except Exception as e:
636
  logger.error(f"Error in bot_streaming: {str(e)}")
 
645
  # Create the Gradio Interface
646
  # -------------------------------
647
  with gr.Blocks() as demo:
648
+ gr.Markdown("# Document Analyzer with Model and Prompt Selection")
649
+ gr.Markdown("Upload a PDF or image (PNG, JPG, JPEG, GIF, BMP, WEBP), select a model and a prompt to analyze its contents.")
650
 
651
  with gr.Row():
652
  file_upload = gr.File(
 
656
  upload_status = gr.Textbox(label="Upload Status", interactive=True)
657
 
658
  with gr.Row():
659
+ model_dropdown = gr.Dropdown(
660
+ label="Select Model",
661
+ choices=[
662
+ "google/gemini-2.0-pro-exp-02-05:free",
663
+ "meta-llama/llama-3.2-11b-vision-instruct:free",
664
+ "qwen/qwen-vl-plus:free",
665
+ "google/gemini-2.0-flash-lite-preview-02-05:free",
666
+ "google/gemini-2.0-flash-thinking-exp:free",
667
+ "qwen/qwen2.5-vl-72b-instruct:free"
668
+ # "openai/gpt-4-vision-preview" # Uncomment if you have access and want to include
669
+ ],
670
+ value="google/gemini-2.0-pro-exp-02-05:free" # Default model
671
+ )
672
  prompt_dropdown = gr.Dropdown(
673
  label="Select Prompt",
674
  choices=["Default","Structured Software Tester","UserStoryCraft","APIDoc","DBModel","RiskAssess","CodeComment","RequirementCraft","DesignDoc","DiagramGen","TechWrite","UIUXReview","AccessibilityCheck","RiskAssess"],
675
  value="Default"
676
  )
677
+
678
  # Additional textbox for user messages
679
  with gr.Row():
680
  user_message_input = gr.Textbox(
 
682
  placeholder="Enter any additional instructions or context here (optional)",
683
  lines=4
684
  )
685
+
686
  with gr.Row():
687
  generate_btn = gr.Button("Generate")
688
  clear_btn = gr.Button("Clear Document Context")
689
+
690
  output_text = gr.Textbox(label="Output", interactive=False, lines=15)
691
+
692
  file_upload.change(fn=process_uploaded_file, inputs=[file_upload], outputs=[upload_status])
693
+ # Pass model, prompt and user message to bot_streaming
694
+ generate_btn.click(fn=bot_streaming, inputs=[model_dropdown, prompt_dropdown, user_message_input], outputs=[output_text])
695
  clear_btn.click(fn=clear_context, outputs=[upload_status])
696
 
697
+ demo.launch(debug=True)