Daemontatox committed on
Commit
d65975a
·
verified ·
1 Parent(s): c19ad99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -24
app.py CHANGED
@@ -51,14 +51,14 @@ def process_pdf_file(file_path):
51
  page_text = page.get_text("text")
52
  if page_text.strip():
53
  text += f"Page {page_num + 1}:\n{page_text}\n\n"
54
-
55
  # Render page as an image with a zoom factor
56
  zoom = 3
57
  mat = fitz.Matrix(zoom, zoom)
58
  pix = page.get_pixmap(matrix=mat, alpha=False)
59
  img_data = pix.tobytes("png")
60
  img = Image.open(io.BytesIO(img_data)).convert("RGB")
61
-
62
  # Resize if image is too large
63
  max_size = 1600
64
  if max(img.size) > max_size:
@@ -83,7 +83,7 @@ def process_uploaded_file(file):
83
  doc_state.clear()
84
  if file is None:
85
  return "No file uploaded. Please upload a file."
86
-
87
  # Get the file path from the Gradio upload (may be a dict or file-like object)
88
  if isinstance(file, dict):
89
  file_path = file["name"]
@@ -91,7 +91,7 @@ def process_uploaded_file(file):
91
  file_path = file.name
92
  file_ext = file_path.lower().split('.')[-1]
93
  image_extensions = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'}
94
-
95
  if file_ext == 'pdf':
96
  doc_state.doc_type = 'pdf'
97
  try:
@@ -121,7 +121,7 @@ def process_uploaded_file(file):
121
  # -------------------------------
122
  # Bot Streaming Function Using the Multimodal API
123
  # -------------------------------
124
- def bot_streaming(prompt_option, user_message, max_new_tokens=8192):
125
  """
126
  Build a multimodal message payload and call the inference API.
127
  The payload includes:
@@ -576,15 +576,15 @@ This comprehensive system prompt provides a strong foundation for building a pow
576
  """
577
  )
578
  }
579
-
580
  # Select the appropriate prompt
581
  selected_prompt = prompts.get(prompt_option, "Invalid prompt selected.")
582
  full_prompt = selected_prompt
583
-
584
  # Append the user-provided message, if any
585
  if user_message and user_message.strip():
586
  full_prompt += "\nUser Message:\n" + user_message
587
-
588
  # Append document context if available
589
  if doc_state.current_doc_images and doc_state.current_doc_text:
590
  full_prompt += "\nDocument context:\n" + doc_state.current_doc_text
@@ -602,7 +602,7 @@ This comprehensive system prompt provides a strong foundation for building a pow
602
  ]
603
  }
604
  ]
605
-
606
  # If an image is available, encode it as a data URI and append it as an image_url message.
607
  if doc_state.current_doc_images:
608
  buffered = io.BytesIO()
@@ -614,22 +614,23 @@ This comprehensive system prompt provides a strong foundation for building a pow
614
  "type": "image_url",
615
  "image_url": {"url": data_uri}
616
  })
617
-
618
  # Call the inference API with streaming enabled.
619
  stream = client.chat.completions.create(
620
- model="google/gemini-2.0-pro-exp-02-05:free",
621
  messages=messages,
622
  max_tokens=max_new_tokens,
623
  stream=True
624
  )
625
-
626
  buffer = ""
627
  for chunk in stream:
628
  # The response structure is similar to the reference: each chunk contains a delta.
629
  delta = chunk.choices[0].delta.content
630
- buffer += delta
631
- time.sleep(0.01)
632
- yield buffer
 
633
 
634
  except Exception as e:
635
  logger.error(f"Error in bot_streaming: {str(e)}")
@@ -644,8 +645,8 @@ def clear_context():
644
  # Create the Gradio Interface
645
  # -------------------------------
646
  with gr.Blocks() as demo:
647
- gr.Markdown("# Document Analyzer with Predetermined Prompts")
648
- gr.Markdown("Upload a PDF or image (PNG, JPG, JPEG, GIF, BMP, WEBP) and select a prompt to analyze its contents.")
649
 
650
  with gr.Row():
651
  file_upload = gr.File(
@@ -655,12 +656,25 @@ with gr.Blocks() as demo:
655
  upload_status = gr.Textbox(label="Upload Status", interactive=True)
656
 
657
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
658
  prompt_dropdown = gr.Dropdown(
659
  label="Select Prompt",
660
  choices=["Default","Structured Software Tester","UserStoryCraft","APIDoc","DBModel","RiskAssess","CodeComment","RequirementCraft","DesignDoc","DiagramGen","TechWrite","UIUXReview","AccessibilityCheck","RiskAssess"],
661
  value="Default"
662
  )
663
-
664
  # Additional textbox for user messages
665
  with gr.Row():
666
  user_message_input = gr.Textbox(
@@ -668,16 +682,16 @@ with gr.Blocks() as demo:
668
  placeholder="Enter any additional instructions or context here (optional)",
669
  lines=4
670
  )
671
-
672
  with gr.Row():
673
  generate_btn = gr.Button("Generate")
674
  clear_btn = gr.Button("Clear Document Context")
675
-
676
  output_text = gr.Textbox(label="Output", interactive=False, lines=15)
677
-
678
  file_upload.change(fn=process_uploaded_file, inputs=[file_upload], outputs=[upload_status])
679
- # Pass both the prompt and the additional user message to bot_streaming
680
- generate_btn.click(fn=bot_streaming, inputs=[prompt_dropdown, user_message_input], outputs=[output_text])
681
  clear_btn.click(fn=clear_context, outputs=[upload_status])
682
 
683
- demo.launch(debug=True)
 
51
  page_text = page.get_text("text")
52
  if page_text.strip():
53
  text += f"Page {page_num + 1}:\n{page_text}\n\n"
54
+
55
  # Render page as an image with a zoom factor
56
  zoom = 3
57
  mat = fitz.Matrix(zoom, zoom)
58
  pix = page.get_pixmap(matrix=mat, alpha=False)
59
  img_data = pix.tobytes("png")
60
  img = Image.open(io.BytesIO(img_data)).convert("RGB")
61
+
62
  # Resize if image is too large
63
  max_size = 1600
64
  if max(img.size) > max_size:
 
83
  doc_state.clear()
84
  if file is None:
85
  return "No file uploaded. Please upload a file."
86
+
87
  # Get the file path from the Gradio upload (may be a dict or file-like object)
88
  if isinstance(file, dict):
89
  file_path = file["name"]
 
91
  file_path = file.name
92
  file_ext = file_path.lower().split('.')[-1]
93
  image_extensions = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'}
94
+
95
  if file_ext == 'pdf':
96
  doc_state.doc_type = 'pdf'
97
  try:
 
121
  # -------------------------------
122
  # Bot Streaming Function Using the Multimodal API
123
  # -------------------------------
124
+ def bot_streaming(model_option, prompt_option, user_message, max_new_tokens=8192):
125
  """
126
  Build a multimodal message payload and call the inference API.
127
  The payload includes:
 
576
  """
577
  )
578
  }
579
+
580
  # Select the appropriate prompt
581
  selected_prompt = prompts.get(prompt_option, "Invalid prompt selected.")
582
  full_prompt = selected_prompt
583
+
584
  # Append the user-provided message, if any
585
  if user_message and user_message.strip():
586
  full_prompt += "\nUser Message:\n" + user_message
587
+
588
  # Append document context if available
589
  if doc_state.current_doc_images and doc_state.current_doc_text:
590
  full_prompt += "\nDocument context:\n" + doc_state.current_doc_text
 
602
  ]
603
  }
604
  ]
605
+
606
  # If an image is available, encode it as a data URI and append it as an image_url message.
607
  if doc_state.current_doc_images:
608
  buffered = io.BytesIO()
 
614
  "type": "image_url",
615
  "image_url": {"url": data_uri}
616
  })
617
+
618
  # Call the inference API with streaming enabled.
619
  stream = client.chat.completions.create(
620
+ model=model_option, # Use the selected model here
621
  messages=messages,
622
  max_tokens=max_new_tokens,
623
  stream=True
624
  )
625
+
626
  buffer = ""
627
  for chunk in stream:
628
  # The response structure is similar to the reference: each chunk contains a delta.
629
  delta = chunk.choices[0].delta.content
630
+ if delta is not None: # Check if delta is not None
631
+ buffer += delta
632
+ time.sleep(0.01)
633
+ yield buffer
634
 
635
  except Exception as e:
636
  logger.error(f"Error in bot_streaming: {str(e)}")
 
645
  # Create the Gradio Interface
646
  # -------------------------------
647
  with gr.Blocks() as demo:
648
+ gr.Markdown("# Document Analyzer with Model and Prompt Selection")
649
+ gr.Markdown("Upload a PDF or image (PNG, JPG, JPEG, GIF, BMP, WEBP), select a model and a prompt to analyze its contents.")
650
 
651
  with gr.Row():
652
  file_upload = gr.File(
 
656
  upload_status = gr.Textbox(label="Upload Status", interactive=True)
657
 
658
  with gr.Row():
659
+ model_dropdown = gr.Dropdown(
660
+ label="Select Model",
661
+ choices=[
662
+ "google/gemini-2.0-pro-exp-02-05:free",
663
+ "meta-llama/llama-3.2-11b-vision-instruct:free",
664
+ "qwen/qwen-vl-plus:free",
665
+ "google/gemini-2.0-flash-lite-preview-02-05:free",
666
+ "google/gemini-2.0-flash-thinking-exp:free",
667
+ "qwen/qwen2.5-vl-72b-instruct:free"
668
+ # "openai/gpt-4-vision-preview" # Uncomment if you have access and want to include
669
+ ],
670
+ value="google/gemini-2.0-pro-exp-02-05:free" # Default model
671
+ )
672
  prompt_dropdown = gr.Dropdown(
673
  label="Select Prompt",
674
  choices=["Default","Structured Software Tester","UserStoryCraft","APIDoc","DBModel","RiskAssess","CodeComment","RequirementCraft","DesignDoc","DiagramGen","TechWrite","UIUXReview","AccessibilityCheck","RiskAssess"],
675
  value="Default"
676
  )
677
+
678
  # Additional textbox for user messages
679
  with gr.Row():
680
  user_message_input = gr.Textbox(
 
682
  placeholder="Enter any additional instructions or context here (optional)",
683
  lines=4
684
  )
685
+
686
  with gr.Row():
687
  generate_btn = gr.Button("Generate")
688
  clear_btn = gr.Button("Clear Document Context")
689
+
690
  output_text = gr.Textbox(label="Output", interactive=False, lines=15)
691
+
692
  file_upload.change(fn=process_uploaded_file, inputs=[file_upload], outputs=[upload_status])
693
+ # Pass model, prompt and user message to bot_streaming
694
+ generate_btn.click(fn=bot_streaming, inputs=[model_dropdown, prompt_dropdown, user_message_input], outputs=[output_text])
695
  clear_btn.click(fn=clear_context, outputs=[upload_status])
696
 
697
+ demo.launch(debug=True)