Spaces:

amirza
/

draw_me_a_sheep_heb

Runtime error

Amir Zait

bugfix

2d31a01 about 2 years ago

3.51 kB

	import soundfile as sf
	import gradio as gr
	import jax
	import numpy as np
	import os
	from PIL import Image
	import random
	import sox
	import torch

	from transformers import AutoProcessor, AutoModelForCTC
	from transformers import pipeline
	from dalle_mini import DalleBart, DalleBartProcessor
	from vqgan_jax.modeling_flax_vqgan import VQModel

	# Torch device handle: CUDA when available, CPU otherwise.
	# NOTE(review): nothing in the visible code moves a model or tensor to `device`.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	# Token read from the environment; None when API_TOKEN is not set.
	api_token = os.environ.get("API_TOKEN")

	# Hebrew speech recognition: processor and CTC model come from the same repo.
	ASR_MODEL = "imvladikon/wav2vec2-xls-r-300m-hebrew"
	asr_processor = AutoProcessor.from_pretrained(ASR_MODEL)
	asr_model = AutoModelForCTC.from_pretrained(ASR_MODEL)

	# Hebrew -> English translation pipeline.
	HE_EN_MODEL = "Helsinki-NLP/opus-mt-tc-big-he-en"
	he_en_translator = pipeline("translation", model=HE_EN_MODEL)

	# DALL-E mini checkpoint. The "mega" variant is too large, so "mini" is used:
	# DALLE_MODEL = "dalle-mini/dalle-mini/mega-1-fp16:latest" # can be wandb artifact or 🤗 Hub or local folder or google bucket
	DALLE_MODEL = "dalle-mini/dalle-mini/mini-1:v0"
	DALLE_COMMIT_ID = None

	# VQGAN checkpoint, pinned to a fixed commit; used to decode image tokens.
	VQGAN_REPO = "dalle-mini/vqgan_imagenet_f16_16384"
	VQGAN_COMMIT_ID = "e93a26e7707683d349bf5d5c41c5b0ef69b677a9"

	# Image-generation stack: text-to-image-token model, token decoder, prompt processor.
	model = DalleBart.from_pretrained(DALLE_MODEL, revision=DALLE_COMMIT_ID)
	vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)
	processor = DalleBartProcessor.from_pretrained(DALLE_MODEL, revision=DALLE_COMMIT_ID)

	def generate_image(text):
	    """Generate a single image for a text prompt.

	    The prompt is tokenized with the module-level ``processor``, image
	    tokens are sampled with ``model.generate`` and decoded to pixels with
	    ``vqgan.decode_code``.

	    Args:
	        text: Prompt string; it is passed to the processor as a
	            one-element batch.

	    Returns:
	        A ``PIL.Image.Image`` built from the first decoded image, after
	        reshaping to 256x256x3 and converting to uint8.
	    """
	    tokenized_prompt = processor([text])

	    # Sampling settings: no top-k / top-p filtering.
	    gen_top_k = None
	    gen_top_p = None
	    temperature = 0.85
	    cond_scale = 3.0

	    # Fresh random seed per call. The upper bound must be an int:
	    # random.randint() rejects float bounds such as 1e7 on Python 3.12+
	    # (and warns on 3.10/3.11).
	    seed = random.randint(0, 10**7)

	    encoded_images = model.generate(
	        **tokenized_prompt,
	        prng_key=jax.random.PRNGKey(seed),
	        params=model.params,
	        top_k=gen_top_k,
	        top_p=gen_top_p,
	        temperature=temperature,
	        condition_scale=cond_scale,
	    )
	    # Drop the first token of every generated sequence
	    # (presumably the BOS token -- confirm against dalle_mini).
	    encoded_images = encoded_images.sequences[..., 1:]
	    decoded_images = vqgan.decode_code(encoded_images, vqgan.params)
	    # Clamp to [0, 1] and lay the batch out as (N, 256, 256, 3).
	    decoded_images = decoded_images.clip(0.0, 1.0).reshape((-1, 256, 256, 3))
	    img = decoded_images[0]
	    # Scale to 0..255 and convert to uint8 for PIL.
	    return Image.fromarray(np.asarray(img * 255, dtype=np.uint8))

	def convert(inputfile, outfile):
	    """Transcode ``inputfile`` to ``outfile`` with SoX.

	    The output is written as WAV: 1 channel, 16000 Hz, 16-bit
	    signed-integer samples.
	    """
	    output_format = dict(
	        file_type="wav",
	        channels=1,
	        encoding="signed-integer",
	        rate=16000,
	        bits=16,
	    )
	    transformer = sox.Transformer()
	    transformer.set_output_format(**output_format)
	    transformer.build(inputfile, outfile)

	def parse_transcription(wav_file):
	    """Transcribe a Hebrew recording, translate it, and draw the result.

	    Steps: convert the recording to a 16 kHz WAV, run the Hebrew ASR
	    model, translate the transcript to English, then generate an image
	    from the English text.

	    Args:
	        wav_file: File-like object from the microphone input; only its
	            ``name`` attribute (a path on disk) is used.

	    Returns:
	        A ``(transcription, image)`` tuple: the Hebrew transcript and the
	        image returned by ``generate_image``.
	    """
	    # NOTE(review): the Audio input is declared with optional=True, so
	    # wav_file may presumably be None when nothing was recorded; that case
	    # is not handled here -- confirm the desired behavior.

	    # Strip only the final extension. Splitting on the first "." breaks
	    # for paths such as "./rec.wav" or "/tmp/tmp.ab12/rec.wav".
	    base_path = os.path.splitext(wav_file.name)[0]
	    resampled_path = base_path + "16k.wav"

	    try:
	        convert(wav_file.name, resampled_path)
	        # sf.read returns (samples, sample_rate); the rate is fixed by convert().
	        speech, _ = sf.read(resampled_path)
	    finally:
	        # Best-effort cleanup so converted files do not pile up on disk.
	        try:
	            os.remove(resampled_path)
	        except OSError:
	            pass

	    # transcribe to hebrew
	    input_values = asr_processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
	    # Inference only: skip autograd bookkeeping.
	    with torch.no_grad():
	        logits = asr_model(input_values).logits
	    predicted_ids = torch.argmax(logits, dim=-1)
	    transcription = asr_processor.decode(predicted_ids[0], skip_special_tokens=True)

	    print(transcription)

	    # translate to english
	    translated = he_en_translator(transcription)[0]['translation_text']

	    print(translated)

	    # generate image
	    image = generate_image(translated)
	    return transcription, image

	# Output widgets: the Hebrew transcript and the generated image.
	outputs = [
	    gr.outputs.Textbox(label="transcript"),
	    gr.outputs.Image(label=''),
	]
	# Microphone recording handed to the callback as a file object.
	input_mic = gr.inputs.Audio(source="microphone", type="file", optional=True)

	# Presentation and runtime options for the interface.
	interface_options = dict(
	    analytics_enabled=False,
	    show_tips=False,
	    theme='huggingface',
	    layout='horizontal',
	    title="Draw Me A Sheep in Hebrew",
	    enable_queue=True,
	)

	# Build the app around parse_transcription, then start serving it.
	demo = gr.Interface(
	    parse_transcription,
	    inputs=[input_mic],
	    outputs=outputs,
	    **interface_options,
	)
	demo.launch(inline=False)