Nitzz4952 committed on
Commit
fa83200
·
verified ·
1 Parent(s): 380f7de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -65
app.py CHANGED
@@ -1,107 +1,56 @@
1
  import gradio as gr
2
  from PIL import Image, ImageDraw, ImageFont
3
  import scipy.io.wavfile as wavfile
4
-
5
  from transformers import pipeline
6
 
7
- narrator = pipeline("text-to-speech",
8
- model="kakao-enterprise/vits-ljs")
9
-
10
- object_detector = pipeline("object-detection",
11
- model="facebook/detr-resnet-50")
12
 
13
  def generate_audio(text):
14
  # Generate the narrated text
15
  narrated_text = narrator(text)
16
-
17
  # Save the audio to a WAV file
18
- wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
19
- data=narrated_text["audio"][0])
20
-
21
- # Return the path to the saved audio file
22
  return "output.wav"
23
 
24
  def read_objects(detection_objects):
25
- # Initialize counters for each object label
26
  object_counts = {}
27
-
28
- # Count the occurrences of each label
29
  for detection in detection_objects:
30
  label = detection['label']
31
- if label in object_counts:
32
- object_counts[label] += 1
33
- else:
34
- object_counts[label] = 1
35
-
36
- # Generate the response string
37
  response = "This picture contains"
38
  labels = list(object_counts.keys())
39
  for i, label in enumerate(labels):
40
- response += f" {object_counts[label]} {label}"
41
- if object_counts[label] > 1:
42
- response += "s"
43
  if i < len(labels) - 2:
44
  response += ","
45
  elif i == len(labels) - 2:
46
  response += " and"
47
-
48
  response += "."
49
-
50
  return response
51
 
52
-
53
-
54
  def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
55
- """
56
- Draws bounding boxes on the given image based on the detections.
57
- :param image: PIL.Image object
58
- :param detections: List of detection results, where each result is a dictionary containing
59
- 'score', 'label', and 'box' keys. 'box' itself is a dictionary with 'xmin',
60
- 'ymin', 'xmax', 'ymax'.
61
- :param font_path: Path to the TrueType font file to use for text.
62
- :param font_size: Size of the font to use for text.
63
- :return: PIL.Image object with bounding boxes drawn.
64
- """
65
- # Make a copy of the image to draw on
66
  draw_image = image.copy()
67
  draw = ImageDraw.Draw(draw_image)
68
-
69
- # Load custom font or default font if path not provided
70
  if font_path:
71
  font = ImageFont.truetype(font_path, font_size)
72
  else:
73
- # When font_path is not provided, load default font but it's size is fixed
74
  font = ImageFont.load_default()
75
- # Increase font size workaround by using a TTF font file, if needed, can download and specify the path
76
-
77
  for detection in detections:
78
  box = detection['box']
79
- xmin = box['xmin']
80
- ymin = box['ymin']
81
- xmax = box['xmax']
82
- ymax = box['ymax']
83
-
84
- # Draw the bounding box
85
  draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
86
-
87
- # Optionally, you can also draw the label and score
88
  label = detection['label']
89
  score = detection['score']
90
  text = f"{label} {score:.2f}"
91
-
92
- # Draw text with background rectangle for visibility
93
- if font_path: # Use the custom font with increased size
94
  text_size = draw.textbbox((xmin, ymin), text, font=font)
95
  else:
96
- # Calculate text size using the default font
97
  text_size = draw.textbbox((xmin, ymin), text)
98
-
99
  draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
100
  draw.text((xmin, ymin), text, fill="white", font=font)
101
-
102
  return draw_image
103
 
104
-
105
  def detect_object(image):
106
  raw_image = image
107
  output = object_detector(raw_image)
@@ -110,10 +59,20 @@ def detect_object(image):
110
  processed_audio = generate_audio(natural_text)
111
  return processed_image, processed_audio
112
 
113
-
114
- demo = gr.Interface(fn=detect_object,
115
- inputs=[gr.Image(label="Select Image",type="pil")],
116
- outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
117
- title= "Audio Described Object Detector",
118
- description="THIS APPLICATION WILL BE USED TO HIGHLIGHT OBJECTS AND GIVES AUDIO DESCRIPTION FOR THE PROVIDED INPUT IMAGE.")
119
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from PIL import Image, ImageDraw, ImageFont
3
  import scipy.io.wavfile as wavfile
 
4
  from transformers import pipeline
5
 
6
+ narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
7
+ object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
 
 
 
8
 
9
  def generate_audio(text):
10
  # Generate the narrated text
11
  narrated_text = narrator(text)
 
12
  # Save the audio to a WAV file
13
+ wavfile.write("output.wav", rate=narrated_text["sampling_rate"], data=narrated_text["audio"][0])
 
 
 
14
  return "output.wav"
15
 
16
  def read_objects(detection_objects):
 
17
  object_counts = {}
 
 
18
  for detection in detection_objects:
19
  label = detection['label']
20
+ object_counts[label] = object_counts.get(label, 0) + 1
 
 
 
 
 
21
  response = "This picture contains"
22
  labels = list(object_counts.keys())
23
  for i, label in enumerate(labels):
24
+ response += f" {object_counts[label]} {label}" + ("s" if object_counts[label] > 1 else "")
 
 
25
  if i < len(labels) - 2:
26
  response += ","
27
  elif i == len(labels) - 2:
28
  response += " and"
 
29
  response += "."
 
30
  return response
31
 
 
 
32
  def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
 
 
 
 
 
 
 
 
 
 
 
33
  draw_image = image.copy()
34
  draw = ImageDraw.Draw(draw_image)
 
 
35
  if font_path:
36
  font = ImageFont.truetype(font_path, font_size)
37
  else:
 
38
  font = ImageFont.load_default()
 
 
39
  for detection in detections:
40
  box = detection['box']
41
+ xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
 
 
 
 
 
42
  draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
 
 
43
  label = detection['label']
44
  score = detection['score']
45
  text = f"{label} {score:.2f}"
46
+ if font_path:
 
 
47
  text_size = draw.textbbox((xmin, ymin), text, font=font)
48
  else:
 
49
  text_size = draw.textbbox((xmin, ymin), text)
 
50
  draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
51
  draw.text((xmin, ymin), text, fill="white", font=font)
 
52
  return draw_image
53
 
 
54
  def detect_object(image):
55
  raw_image = image
56
  output = object_detector(raw_image)
 
59
  processed_audio = generate_audio(natural_text)
60
  return processed_image, processed_audio
61
 
62
+ examples = [
63
+ ["dogs.jpg"]
64
+ ]
65
+
66
+ demo = gr.Interface(
67
+ fn=detect_object,
68
+ inputs=[gr.Image(label="Select Image", type="pil")],
69
+ outputs=[
70
+ gr.Image(label="Processed Image", type="pil"),
71
+ gr.Audio(label="Generated Audio")
72
+ ],
73
+ title="Audio Described Object Detector",
74
+ description="This application highlights objects in the provided image and generates an audio description.",
75
+ examples=examples
76
+ )
77
+
78
+ demo.launch()