Spaces:
Runtime error
Runtime error
from dotenv import load_dotenv, find_dotenv | |
from transformers import pipeline | |
from langchain import LLMChain, OpenAI, PromptTemplate | |
import requests | |
import os | |
# UI layer | |
import streamlit as st | |
# Load variables from the .env file located by find_dotenv() before any of
# them are read below.
load_dotenv(find_dotenv())

# Sent as the Bearer credential on the Hugging Face Inference API requests in
# this file; None when the environment variable is not set.
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
# The app runs three steps: image to text, text to song (LLM), and text to speech.
# image to text | |
def image_to_text(url, use_api=True):
    """Generate a caption for an image.

    Args:
        url: Image location. With ``use_api=True`` only the last
            ``/``-separated component is used, and it is opened as a file
            relative to the current working directory. With
            ``use_api=False`` the value is passed unchanged to the local
            ``transformers`` pipeline.
        use_api: When true (the default), call the hosted Hugging Face
            Inference API with the BLIP *large* captioning model. Otherwise
            download the BLIP *base* model and run it locally, which is slow.

    Returns:
        The ``generated_text`` of the first caption returned by the model.

    Raises:
        OSError: If the image file cannot be opened (API mode).
        requests.HTTPError: If the Inference API answers with an error status.
        RuntimeError: If the API response is not a non-empty list of captions.
    """
    if not use_api:
        # Download the model and use it, which is slow.
        captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
        result = captioner(url)
        # Example result:
        # [{'generated_text': 'two birds are standing next to each other '}]
        return result[0]['generated_text']

    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}

    filename = url.split("/")[-1]
    with open(filename, "rb") as f:
        data = f.read()

    # A timeout keeps the UI from hanging forever on a stalled connection.
    response = requests.post(API_URL, headers=headers, data=data, timeout=120)
    # Surface HTTP failures directly instead of failing later while indexing
    # an error payload.
    response.raise_for_status()

    payload = response.json()
    is_caption_list = (
        isinstance(payload, list)
        and payload
        and isinstance(payload[0], dict)
        and "generated_text" in payload[0]
    )
    if not is_caption_list:
        raise RuntimeError(f"Unexpected captioning response: {payload!r}")
    return payload[0]['generated_text']
# LLM | |
def generate_story(story_idea):
    """Ask the LLM to write a short English song about *story_idea*.

    Args:
        story_idea: Text substituted into the prompt as the song's context.

    Returns:
        Whatever ``LLMChain.run`` returns for the filled-in prompt.
    """
    # An earlier, now-disabled prompt variant asked for a song of at most
    # 100 words in Nepali; the active prompt asks for at most 150 words in
    # English.
    template = (
        "\n"
        "you are a song writer, write a song using following context:\n"
        "{story_idea}.\n"
        "Song should not be more than 150 words. It should be in English language.\n"
    )
    song_prompt = PromptTemplate(input_variables=["story_idea"], template=template)

    # NOTE(review): 'gpt-3.5-turbo-0301' is a dated model snapshot passed to
    # the completion-style OpenAI wrapper -- confirm it is still available.
    llm = OpenAI(model_name='gpt-3.5-turbo-0301', temperature=1)
    chain = LLMChain(llm=llm, prompt=song_prompt, verbose=True)
    return chain.run(story_idea)
# text to speech | |
def text_to_speech(story):
    """Synthesize speech for *story* and save it as ``story_audio.flac``.

    The text is posted to the hosted ``espnet/kan-bayashi_ljspeech_vits``
    model on the Hugging Face Inference API, and the response body is written
    to ``story_audio.flac`` in the current working directory, overwriting any
    existing file.

    Args:
        story: The text to be spoken.

    Raises:
        requests.HTTPError: If the Inference API answers with an error status.
        OSError: If the output file cannot be written.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payloads = {"inputs": story}

    # A timeout keeps the UI from hanging forever on a stalled connection.
    response = requests.post(API_URL, headers=headers, json=payloads, timeout=120)
    # Fail loudly rather than saving an error body under an audio file name,
    # which the caller would then try to play.
    response.raise_for_status()

    with open("story_audio.flac", "wb") as file:
        file.write(response.content)
# caption = image_to_text("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") | |
# story = generate_story(story_idea="Two parrots singing a song") | |
# text_to_speech(story="Two parrots singing a song") | |
def main():
    """Streamlit page: upload a JPG, caption it, write a song, and play it.

    Once a file is uploaded, it is saved to the current working directory,
    captioned with ``image_to_text``, turned into song lyrics with
    ``generate_story``, and converted to audio with ``text_to_speech``. The
    caption, the lyrics and an audio player are shown on the page.
    """
    st.set_page_config(page_title="Upload any image to hear a nice story")
    st.header("Listen to what your image has to tell you. JK DEMO APP")

    uploaded_file = st.file_uploader("Choose an image...", type="jpg")
    if uploaded_file is None:
        return

    # The upload's name comes from the client, so keep only its final path
    # component: the file is then always written inside the working directory
    # and under the same name that image_to_text() opens.
    image_name = os.path.basename(uploaded_file.name)
    with open(image_name, "wb") as file:
        file.write(uploaded_file.getvalue())
    st.image(uploaded_file, caption="Uploaded image", use_column_width=True)

    image_description = image_to_text(image_name, use_api=True)
    # Display image description on FE
    with st.expander("Image Description"):
        st.write(image_description)

    story = generate_story(story_idea=image_description)
    # Display story text on FE
    with st.expander("Story"):
        st.write(story)

    # Display audio player on FE
    text_to_speech(story=story)
    st.audio("story_audio.flac")
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()