HuggingGPT / models_server.py
tricktreat's picture
text to video
5471e91
raw
history blame
29.7 kB
import argparse
import logging
import random
import uuid
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from datasets import load_dataset
from PIL import Image
import io
from torchvision import transforms
import torch
import torchaudio
from speechbrain.pretrained import WaveformEnhancement
import joblib
from huggingface_hub import hf_hub_url, cached_download
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, AutoFeatureExtractor
from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
from controlnet_aux.open_pose.body import Body
from controlnet_aux.mlsd.models.mbv2_mlsd_large import MobileV2_MLSD_Large
from controlnet_aux.hed import Network
from transformers import DPTForDepthEstimation, DPTFeatureExtractor
import warnings
import time
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from asteroid.models import BaseModel
import traceback
import os
import yaml
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="config.yaml")
args = parser.parse_args()
if __name__ != "__main__":
args.config = "config.gradio.yaml"
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)
local_deployment = config["local_deployment"]
if config["inference_mode"] == "huggingface":
local_deployment = "none"
PROXY = None
if config["proxy"]:
PROXY = {
"https": config["proxy"],
}
start = time.time()
# local_models = "models/"
local_models = ""
def load_pipes(local_deployment):
other_pipes = {}
standard_pipes = {}
controlnet_sd_pipes = {}
if local_deployment in ["full"]:
other_pipes = {
"nlpconnect/vit-gpt2-image-captioning":{
"model": VisionEncoderDecoderModel.from_pretrained(f"{local_models}nlpconnect/vit-gpt2-image-captioning"),
"feature_extractor": ViTImageProcessor.from_pretrained(f"{local_models}nlpconnect/vit-gpt2-image-captioning"),
"tokenizer": AutoTokenizer.from_pretrained(f"{local_models}nlpconnect/vit-gpt2-image-captioning"),
"device": "cuda:0"
},
# "Salesforce/blip-image-captioning-large": {
# "model": BlipForConditionalGeneration.from_pretrained(f"Salesforce/blip-image-captioning-large"),
# "processor": BlipProcessor.from_pretrained(f"Salesforce/blip-image-captioning-large"),
# "device": "cuda:0"
# },
"damo-vilab/text-to-video-ms-1.7b": {
"model": DiffusionPipeline.from_pretrained(f"{local_models}damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
"device": "cuda:0"
},
# "facebook/maskformer-swin-large-ade": {
# "model": MaskFormerForInstanceSegmentation.from_pretrained(f"facebook/maskformer-swin-large-ade"),
# "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"),
# "device": "cuda:0"
# },
# "microsoft/trocr-base-printed": {
# "processor": TrOCRProcessor.from_pretrained(f"microsoft/trocr-base-printed"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"microsoft/trocr-base-printed"),
# "device": "cuda:0"
# },
# "microsoft/trocr-base-handwritten": {
# "processor": TrOCRProcessor.from_pretrained(f"microsoft/trocr-base-handwritten"),
# "model": VisionEncoderDecoderModel.from_pretrained(f"microsoft/trocr-base-handwritten"),
# "device": "cuda:0"
# },
"JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
"model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
"device": "cuda:0"
},
"espnet/kan-bayashi_ljspeech_vits": {
"model": Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits"),
"device": "cuda:0"
},
"lambdalabs/sd-image-variations-diffusers": {
"model": DiffusionPipeline.from_pretrained(f"{local_models}lambdalabs/sd-image-variations-diffusers"), #torch_dtype=torch.float16
"device": "cuda:0"
},
# "CompVis/stable-diffusion-v1-4": {
# "model": DiffusionPipeline.from_pretrained(f"CompVis/stable-diffusion-v1-4"),
# "device": "cuda:0"
# },
# "stabilityai/stable-diffusion-2-1": {
# "model": DiffusionPipeline.from_pretrained(f"stabilityai/stable-diffusion-2-1"),
# "device": "cuda:0"
# },
"runwayml/stable-diffusion-v1-5": {
"model": DiffusionPipeline.from_pretrained(f"{local_models}runwayml/stable-diffusion-v1-5"),
"device": "cuda:0"
},
# "microsoft/speecht5_tts":{
# "processor": SpeechT5Processor.from_pretrained(f"microsoft/speecht5_tts"),
# "model": SpeechT5ForTextToSpeech.from_pretrained(f"microsoft/speecht5_tts"),
# "vocoder": SpeechT5HifiGan.from_pretrained(f"microsoft/speecht5_hifigan"),
# "embeddings_dataset": load_dataset(f"Matthijs/cmu-arctic-xvectors", split="validation"),
# "device": "cuda:0"
# },
# "speechbrain/mtl-mimic-voicebank": {
# "model": WaveformEnhancement.from_hparams(source="speechbrain/mtl-mimic-voicebank", savedir="models/mtl-mimic-voicebank"),
# "device": "cuda:0"
# },
"microsoft/speecht5_vc":{
"processor": SpeechT5Processor.from_pretrained(f"{local_models}microsoft/speecht5_vc"),
"model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_models}microsoft/speecht5_vc"),
"vocoder": SpeechT5HifiGan.from_pretrained(f"{local_models}microsoft/speecht5_hifigan"),
"embeddings_dataset": load_dataset(f"{local_models}Matthijs/cmu-arctic-xvectors", split="validation"),
"device": "cuda:0"
},
# "julien-c/wine-quality": {
# "model": joblib.load(cached_download(hf_hub_url("julien-c/wine-quality", "sklearn_model.joblib")))
# },
# "facebook/timesformer-base-finetuned-k400": {
# "processor": AutoImageProcessor.from_pretrained(f"facebook/timesformer-base-finetuned-k400"),
# "model": TimesformerForVideoClassification.from_pretrained(f"facebook/timesformer-base-finetuned-k400"),
# "device": "cuda:0"
# },
"facebook/maskformer-swin-base-coco": {
"feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_models}facebook/maskformer-swin-base-coco"),
"model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_models}facebook/maskformer-swin-base-coco"),
"device": "cuda:0"
},
"Intel/dpt-hybrid-midas": {
"model": DPTForDepthEstimation.from_pretrained(f"{local_models}Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
"feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_models}Intel/dpt-hybrid-midas"),
"device": "cuda:0"
}
}
if local_deployment in ["full", "standard"]:
standard_pipes = {
# "superb/wav2vec2-base-superb-ks": {
# "model": pipeline(task="audio-classification", model=f"superb/wav2vec2-base-superb-ks"),
# "device": "cuda:0"
# },
"openai/whisper-base": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_models}openai/whisper-base"),
"device": "cuda:0"
},
"microsoft/speecht5_asr": {
"model": pipeline(task="automatic-speech-recognition", model=f"{local_models}microsoft/speecht5_asr"),
"device": "cuda:0"
},
"Intel/dpt-large": {
"model": pipeline(task="depth-estimation", model=f"{local_models}Intel/dpt-large"),
"device": "cuda:0"
},
# "microsoft/beit-base-patch16-224-pt22k-ft22k": {
# "model": pipeline(task="image-classification", model=f"microsoft/beit-base-patch16-224-pt22k-ft22k"),
# "device": "cuda:0"
# },
"facebook/detr-resnet-50-panoptic": {
"model": pipeline(task="image-segmentation", model=f"{local_models}facebook/detr-resnet-50-panoptic"),
"device": "cuda:0"
},
"facebook/detr-resnet-101": {
"model": pipeline(task="object-detection", model=f"{local_models}facebook/detr-resnet-101"),
"device": "cuda:0"
},
# "openai/clip-vit-large-patch14": {
# "model": pipeline(task="zero-shot-image-classification", model=f"openai/clip-vit-large-patch14"),
# "device": "cuda:0"
# },
"google/owlvit-base-patch32": {
"model": pipeline(task="zero-shot-object-detection", model=f"{local_models}google/owlvit-base-patch32"),
"device": "cuda:0"
},
# "microsoft/DialoGPT-medium": {
# "model": pipeline(task="conversational", model=f"microsoft/DialoGPT-medium"),
# "device": "cuda:0"
# },
# "bert-base-uncased": {
# "model": pipeline(task="fill-mask", model=f"bert-base-uncased"),
# "device": "cuda:0"
# },
# "deepset/roberta-base-squad2": {
# "model": pipeline(task = "question-answering", model=f"deepset/roberta-base-squad2"),
# "device": "cuda:0"
# },
# "facebook/bart-large-cnn": {
# "model": pipeline(task="summarization", model=f"facebook/bart-large-cnn"),
# "device": "cuda:0"
# },
# "google/tapas-base-finetuned-wtq": {
# "model": pipeline(task="table-question-answering", model=f"google/tapas-base-finetuned-wtq"),
# "device": "cuda:0"
# },
# "distilbert-base-uncased-finetuned-sst-2-english": {
# "model": pipeline(task="text-classification", model=f"distilbert-base-uncased-finetuned-sst-2-english"),
# "device": "cuda:0"
# },
# "gpt2": {
# "model": pipeline(task="text-generation", model="gpt2"),
# "device": "cuda:0"
# },
# "mrm8488/t5-base-finetuned-question-generation-ap": {
# "model": pipeline(task="text2text-generation", model=f"mrm8488/t5-base-finetuned-question-generation-ap"),
# "device": "cuda:0"
# },
# "Jean-Baptiste/camembert-ner": {
# "model": pipeline(task="token-classification", model=f"Jean-Baptiste/camembert-ner", aggregation_strategy="simple"),
# "device": "cuda:0"
# },
# "t5-base": {
# "model": pipeline(task="translation", model=f"t5-base"),
# "device": "cuda:0"
# },
"impira/layoutlm-document-qa": {
"model": pipeline(task="document-question-answering", model=f"{local_models}impira/layoutlm-document-qa"),
"device": "cuda:0"
},
"ydshieh/vit-gpt2-coco-en": {
"model": pipeline(task="image-to-text", model=f"{local_models}ydshieh/vit-gpt2-coco-en"),
"device": "cuda:0"
},
"dandelin/vilt-b32-finetuned-vqa": {
"model": pipeline(task="visual-question-answering", model=f"{local_models}dandelin/vilt-b32-finetuned-vqa"),
"device": "cuda:0"
}
}
if local_deployment in ["full", "standard", "minimal"]:
controlnet = ControlNetModel.from_pretrained(f"{local_models}lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
controlnetpipe = StableDiffusionControlNetPipeline.from_pretrained(
f"{local_models}runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
hed_network = HEDdetector.from_pretrained('lllyasviel/ControlNet')
controlnet_sd_pipes = {
"openpose-control": {
"model": OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
},
"mlsd-control": {
"model": MLSDdetector.from_pretrained('lllyasviel/ControlNet')
},
"hed-control": {
"model": hed_network
},
"scribble-control": {
"model": hed_network
},
"midas-control": {
"model": MidasDetector.from_pretrained('lllyasviel/ControlNet')
},
"canny-control": {
"model": CannyDetector()
},
"lllyasviel/sd-controlnet-canny":{
"control": controlnet,
"model": controlnetpipe,
"device": "cuda:0"
},
"lllyasviel/sd-controlnet-depth":{
"control": ControlNetModel.from_pretrained(f"{local_models}lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": "cuda:0"
},
"lllyasviel/sd-controlnet-hed":{
"control": ControlNetModel.from_pretrained(f"{local_models}lllyasviel/sd-controlnet-hed", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": "cuda:0"
},
"lllyasviel/sd-controlnet-mlsd":{
"control": ControlNetModel.from_pretrained(f"{local_models}lllyasviel/sd-controlnet-mlsd", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": "cuda:0"
},
"lllyasviel/sd-controlnet-openpose":{
"control": ControlNetModel.from_pretrained(f"{local_models}lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": "cuda:0"
},
"lllyasviel/sd-controlnet-scribble":{
"control": ControlNetModel.from_pretrained(f"{local_models}lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": "cuda:0"
},
"lllyasviel/sd-controlnet-seg":{
"control": ControlNetModel.from_pretrained(f"{local_models}lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16),
"model": controlnetpipe,
"device": "cuda:0"
}
}
pipes = {**standard_pipes, **other_pipes, **controlnet_sd_pipes}
return pipes
pipes = load_pipes(local_deployment)
end = time.time()
during = end - start
print(f"[ ready ] {during}s")
def running():
return {"running": True}
def status(model_id):
disabled_models = ["microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten"]
if model_id in pipes.keys() and model_id not in disabled_models:
print(f"[ check {model_id} ] success")
return {"loaded": True}
else:
print(f"[ check {model_id} ] failed")
return {"loaded": False}
def models(model_id, data):
while "using" in pipes[model_id] and pipes[model_id]["using"]:
print(f"[ inference {model_id} ] waiting")
time.sleep(0.1)
pipes[model_id]["using"] = True
print(f"[ inference {model_id} ] start")
start = time.time()
pipe = pipes[model_id]["model"]
if "device" in pipes[model_id]:
try:
pipe.to(pipes[model_id]["device"])
except:
pipe.device = torch.device(pipes[model_id]["device"])
pipe.model.to(pipes[model_id]["device"])
result = None
try:
# text to video
if model_id == "damo-vilab/text-to-video-ms-1.7b":
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()
prompt = data["text"]
video_frames = pipe(prompt, num_inference_steps=50, num_frames=40).frames
video_path = export_to_video(video_frames)
file_name = str(uuid.uuid4())[:4]
os.system(f"LD_LIBRARY_PATH=/usr/local/lib /usr/local/bin/ffmpeg -i {video_path} -vcodec libx264 public/videos/{file_name}.mp4")
result = {"path": f"/videos/{file_name}.mp4"}
# controlnet
if model_id.startswith("lllyasviel/sd-controlnet-"):
pipe.controlnet.to('cpu')
pipe.controlnet = pipes[model_id]["control"].to(pipes[model_id]["device"])
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
control_image = load_image(data["img_url"])
# generator = torch.manual_seed(66)
out_image: Image = pipe(data["text"], num_inference_steps=20, image=control_image).images[0]
file_name = str(uuid.uuid4())[:4]
out_image.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
if model_id.endswith("-control"):
image = load_image(data["img_url"])
if "scribble" in model_id:
control = pipe(image, scribble = True)
elif "canny" in model_id:
control = pipe(image, low_threshold=100, high_threshold=200)
else:
control = pipe(image)
file_name = str(uuid.uuid4())[:4]
control.save(f"public/images/{file_name}.png")
result = {"path": f"/images/{file_name}.png"}
# image to image
if model_id == "lambdalabs/sd-image-variations-diffusers":
im = load_image(data["img_url"])
file_name = str(uuid.uuid4())[:4]
with open(f"public/images/{file_name}.png", "wb") as f:
f.write(data)
tform = transforms.Compose([
transforms.ToTensor(),
transforms.Resize(
(224, 224),
interpolation=transforms.InterpolationMode.BICUBIC,
antialias=False,
),
transforms.Normalize(
[0.48145466, 0.4578275, 0.40821073],
[0.26862954, 0.26130258, 0.27577711]),
])
inp = tform(im).to(pipes[model_id]["device"]).unsqueeze(0)
out = pipe(inp, guidance_scale=3)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# image to text
if model_id == "Salesforce/blip-image-captioning-large":
raw_image = load_image(data["img_url"]).convert('RGB')
text = data["text"]
inputs = pipes[model_id]["processor"](raw_image, return_tensors="pt").to(pipes[model_id]["device"])
out = pipe.generate(**inputs)
caption = pipes[model_id]["processor"].decode(out[0], skip_special_tokens=True)
result = {"generated text": caption}
if model_id == "ydshieh/vit-gpt2-coco-en":
img_url = data["img_url"]
generated_text = pipe(img_url)[0]['generated_text']
result = {"generated text": generated_text}
if model_id == "nlpconnect/vit-gpt2-image-captioning":
image = load_image(data["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values, **{"max_length": 200, "num_beams": 1})
generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# image to text: OCR
if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
image = load_image(data["img_url"]).convert("RGB")
pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(pipes[model_id]["device"])
generated_ids = pipe.generate(pixel_values)
generated_text = pipes[model_id]["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0]
result = {"generated text": generated_text}
# text to image
if model_id == "runwayml/stable-diffusion-v1-5":
file_name = str(uuid.uuid4())[:4]
text = data["text"]
out = pipe(prompt=text)
out["images"][0].save(f"public/images/{file_name}.jpg")
result = {"path": f"/images/{file_name}.jpg"}
# object detection
if model_id == "google/owlvit-base-patch32" or model_id == "facebook/detr-resnet-101":
img_url = data["img_url"]
open_types = ["cat", "couch", "person", "car", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird"]
result = pipe(img_url, candidate_labels=open_types)
# VQA
if model_id == "dandelin/vilt-b32-finetuned-vqa":
question = data["text"]
img_url = data["img_url"]
result = pipe(question=question, image=img_url)
#DQA
if model_id == "impira/layoutlm-document-qa":
question = data["text"]
img_url = data["img_url"]
result = pipe(img_url, question)
# depth-estimation
if model_id == "Intel/dpt-large":
output = pipe(data["img_url"])
image = output['depth']
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "Intel/dpt-hybrid-midas" and model_id == "Intel/dpt-large":
image = load_image(data["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt")
with torch.no_grad():
outputs = pipe(**inputs)
predicted_depth = outputs.predicted_depth
prediction = torch.nn.functional.interpolate(
predicted_depth.unsqueeze(1),
size=image.size[::-1],
mode="bicubic",
align_corners=False,
)
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
image = Image.fromarray(formatted)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
# TTS
if model_id == "espnet/kan-bayashi_ljspeech_vits":
text = data["text"]
wav = pipe(text)["wav"]
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", wav.cpu().numpy(), pipe.fs, "PCM_16")
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_tts":
text = data["text"]
inputs = pipes[model_id]["processor"](text=text, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(pipes[model_id]["device"])
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# ASR
if model_id == "openai/whisper-base" or model_id == "microsoft/speecht5_asr":
audio_url = data["audio_url"]
result = { "text": pipe(audio_url)["text"]}
# audio to audio
if model_id == "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k":
audio_url = data["audio_url"]
wav, sr = torchaudio.load(audio_url)
with torch.no_grad():
result_wav = pipe(wav.to(pipes[model_id]["device"]))
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", result_wav.cpu().squeeze().numpy(), sr)
result = {"path": f"/audios/{name}.wav"}
if model_id == "microsoft/speecht5_vc":
audio_url = data["audio_url"]
wav, sr = torchaudio.load(audio_url)
inputs = pipes[model_id]["processor"](audio=wav, sampling_rate=sr, return_tensors="pt")
embeddings_dataset = pipes[model_id]["embeddings_dataset"]
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
pipes[model_id]["vocoder"].to(pipes[model_id]["device"])
speech = pipe.generate_speech(inputs["input_ids"].to(pipes[model_id]["device"]), speaker_embeddings, vocoder=pipes[model_id]["vocoder"])
name = str(uuid.uuid4())[:4]
sf.write(f"public/audios/{name}.wav", speech.cpu().numpy(), samplerate=16000)
result = {"path": f"/audios/{name}.wav"}
# segmentation
if model_id == "facebook/detr-resnet-50-panoptic":
result = []
segments = pipe(data["img_url"])
image = load_image(data["img_url"])
colors = []
for i in range(len(segments)):
colors.append((random.randint(100, 255), random.randint(100, 255), random.randint(100, 255), 50))
for segment in segments:
mask = segment["mask"]
mask = mask.convert('L')
layer = Image.new('RGBA', mask.size, colors[i])
image.paste(layer, (0, 0), mask)
name = str(uuid.uuid4())[:4]
image.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
if model_id == "facebook/maskformer-swin-base-coco" or model_id == "facebook/maskformer-swin-large-ade":
image = load_image(data["img_url"])
inputs = pipes[model_id]["feature_extractor"](images=image, return_tensors="pt").to(pipes[model_id]["device"])
outputs = pipe(**inputs)
result = pipes[model_id]["feature_extractor"].post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
predicted_panoptic_map = result["segmentation"].cpu().numpy()
predicted_panoptic_map = Image.fromarray(predicted_panoptic_map.astype(np.uint8))
name = str(uuid.uuid4())[:4]
predicted_panoptic_map.save(f"public/images/{name}.jpg")
result = {"path": f"/images/{name}.jpg"}
except Exception as e:
print(e)
traceback.print_exc()
result = {"error": {"message": "Error when running the model inference."}}
if "device" in pipes[model_id]:
try:
pipe.to("cpu")
torch.cuda.empty_cache()
except:
pipe.device = torch.device("cpu")
pipe.model.to("cpu")
torch.cuda.empty_cache()
pipes[model_id]["using"] = False
if result is None:
result = {"error": {"message": "model not found"}}
end = time.time()
during = end - start
print(f"[ complete {model_id} ] {during}s")
print(f"[ result {model_id} ] {result}")
return result