import gradio as gr
import numpy as np

from PIL import Image
from base64 import b64encode
from io import BytesIO
from gtts import gTTS
from ultralyticsplus import YOLO

from sahi.prediction import ObjectPrediction
from sahi.utils.cv import read_image_as_pil, visualize_object_predictions

# Load the YOLOv8-small model; CLASS maps class ids to COCO names (id 0 is "person").
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names

def tts(text: str, language: str = "ja") -> str:
    """Converts text into an autoplaying HTML audio snippet.
    Args:
        text (str): text to speak
        language (str): gTTS language code (default: Japanese)
    Returns:
        str: HTML string containing an autoplaying <audio> element
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    bytes_object = BytesIO()
    tts_object.write_to_fp(bytes_object)
    bytes_object.seek(0)
    b64 = b64encode(bytes_object.getvalue()).decode()
    # gTTS emits MP3 data, so serve it with the matching MIME type.
    html = f"""
    <audio controls autoplay>
    <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
    return html
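
# Illustrative usage only (not part of the app flow); the variable name is
# hypothetical. The returned string can be fed directly to a gr.HTML output:
#   greeting_html = tts("こんにちは", language="ja")  # "Hello"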


def yolov8_inference(
    image,
    area_thres=0.35,
    default_bot_voice="おはようございます",  # "Good morning" in Japanese
):
    """
    YOLOv8 inference function
    Args:
        image: Input image
        area_thres: minimum bbox-to-frame area ratio that triggers the greeting
        default_bot_voice: greeting text spoken when a person is close enough
    Returns:
        Rendered image and an HTML audio snippet (empty if no one is close)
    """
    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height
    object_predictions = []
    html_bot_voice = ""
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # keep only "person" detections (COCO class 0)
                continue
            box = xyxy.tolist()
            # Ratio of bbox area to frame area: a rough proximity measure,
            # also displayed as the prediction "score" in the visualization.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            object_prediction = ObjectPrediction(
                bbox=box,
                category_name=CLASS[int(cls)],
                category_id=int(cls),
                score=area_rate,
            )
            object_predictions.append(object_prediction)
            # Greet when the person fills enough of the frame.
            if area_rate >= area_thres:
                html_bot_voice = tts(default_bot_voice, language="ja")

    result = visualize_object_predictions(
        image=np_image,
        object_prediction_list=object_predictions,
        rect_th=2,
        text_th=2,
    )

    return Image.fromarray(result["image"]), html_bot_voice
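
# Quick offline check (a sketch; "sample.jpg" is a hypothetical local image,
# not part of this repo):
#   rendered, greeting = yolov8_inference("sample.jpg", area_thres=0.35)
#   rendered.save("annotated.jpg")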


# The function returns a PIL image, so the output component must use type="pil".
outputs = [gr.Image(type="pil", label="Robot View"),
           gr.HTML()]
title = "Detomo Aisatsu Robot"

demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)
demo_app.launch(debug=True)