File size: 3,396 Bytes
fc48446
0cc6b2a
4cc910e
 
0cc6b2a
 
 
 
 
 
 
 
 
 
 
 
 
 
fc48446
 
 
 
4fcc6b4
 
 
 
 
0cc6b2a
4fcc6b4
fc48446
0cc6b2a
fc48446
4fcc6b4
0cc6b2a
 
4fcc6b4
 
0cc6b2a
4fcc6b4
fc48446
 
 
4fcc6b4
4cc910e
 
 
 
 
 
 
 
 
4fcc6b4
 
 
 
 
 
0cc6b2a
4fcc6b4
fc48446
 
 
 
 
 
 
df4ad39
 
 
 
 
 
fc48446
0cc6b2a
fc48446
0cc6b2a
fc48446
 
4fcc6b4
 
fc48446
0cc6b2a
4fcc6b4
 
 
 
fc48446
0cc6b2a
4fcc6b4
 
 
 
 
 
 
df4ad39
 
4cc910e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import streamlit as st
import importlib.util
import langdetect

try:
    # Make sure the accelerate package is installed before pulling in the
    # model libraries; the message below tells the user how to install it.
    if importlib.util.find_spec("accelerate") is None:
        st.error("缺少 'accelerate' 库,请安装该库以加载 FP8 量化模型。可以使用 'pip install accelerate' 进行安装。")
        st.stop()

    # Standard-library modules used by the helpers and the page script.
    import io
    import os
    import tempfile

    # Third-party modules; a missing one is reported by the handler below.
    from transformers import pipeline
    from gtts import gTTS
except ImportError as import_error:
    st.error(f"导入库时出错: {import_error}")
    st.stop()

# function part
# img2text
@st.cache_resource(show_spinner=False)
def _load_image_captioner():
    """Build the image-captioning pipeline once and reuse it.

    Streamlit re-executes the whole script on every widget interaction, so
    constructing the pipeline inside ``img2text`` would reload the model on
    each rerun. ``st.cache_resource`` keeps a single shared instance instead.
    """
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


def img2text(url):
    """Generate a text caption for an image.

    Args:
        url: Image reference passed unchanged to the captioning pipeline.

    Returns:
        The ``generated_text`` of the first pipeline result, or ``None`` if
        loading or running the model raised. Failures are reported to the
        page with ``st.error`` rather than propagated.
    """
    try:
        image_to_text_model = _load_image_captioner()
        text = image_to_text_model(url)[0]["generated_text"]
        return text
    except Exception as e:
        # Top-level UI boundary: show the problem and let the page continue.
        st.error(f"图像描述生成出错: {e}")
        return None

#  text2story 
@st.cache_resource(show_spinner=False)
def _load_story_generator():
    """Build the text-generation pipeline once and reuse it.

    Streamlit re-executes the whole script on every widget interaction, so
    constructing the pipeline inside ``text2story`` would reload the model on
    each rerun. ``st.cache_resource`` keeps a single shared instance instead.
    """
    # SECURITY NOTE: trust_remote_code=True lets the model repository run its
    # own Python code on this machine; keep it only for repositories you trust.
    return pipeline("text-generation", model="perplexity-ai/r1-1776", trust_remote_code=True)


def text2story(text):
    """Expand a short text prompt into a story.

    Args:
        text: Prompt handed to the text-generation pipeline.

    Returns:
        The ``generated_text`` of the first returned sequence, or ``None`` if
        loading or running the model raised. Failures are reported to the
        page with ``st.error`` rather than propagated.
    """
    try:
        story_generator = _load_story_generator()
        # NOTE(review): max_length appears to cap prompt plus generated tokens
        # together; confirm 200 leaves enough room for the story itself.
        story = story_generator(text, max_length=200, num_return_sequences=1)[0]['generated_text']
        return story
    except Exception as e:
        # Top-level UI boundary: show the problem and let the page continue.
        st.error(f"故事生成出错: {e}")
        return None

# text2audio
def text2audio(story_text):
    """Convert story text to speech audio held in memory.

    The language is detected from the text; if detection fails, English is
    used and the user is told so.

    Args:
        story_text: The text to synthesize.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 containing the audio written
        by gTTS, or ``None`` if synthesis failed. Failures are reported to the
        page with ``st.error`` rather than propagated.
    """
    try:
        try:
            # Detect the language of the story.
            speech_lang = langdetect.detect(story_text)
        except langdetect.LangDetectException:
            st.error("无法检测故事的语言,默认使用英语进行语音合成。")
            speech_lang = 'en'

        # Single synthesis path for both the detected and the fallback
        # language, so a gTTS failure in either case reaches the handler
        # below instead of escaping from inside an ``except`` clause.
        tts = gTTS(text=story_text, lang=speech_lang)
        audio_file = io.BytesIO()
        tts.write_to_fp(audio_file)
        audio_file.seek(0)  # rewind so the consumer reads from the start
        return audio_file
    except Exception as e:
        # Top-level UI boundary: show the problem and let the page continue.
        st.error(f"文本转语音出错: {e}")
        return None

# Page script: upload an image, caption it, turn the caption into a story,
# then synthesize the story as audio.
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    image_bytes = uploaded_file.getvalue()

    st.image(uploaded_file, caption="Uploaded Image",
             use_container_width=True)

    # Streamlit re-executes this script on every widget interaction, including
    # a click on "Play Audio". Keep each stage's output in session state, keyed
    # by the current upload, so a rerun redraws the results instead of running
    # captioning, story generation and speech synthesis all over again.
    upload_key = (uploaded_file.name, len(image_bytes), hash(image_bytes))
    if st.session_state.get("pipeline_upload_key") != upload_key:
        st.session_state["pipeline_upload_key"] = upload_key
        st.session_state["pipeline_results"] = {}
    results = st.session_state["pipeline_results"]

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    if "scenario" not in results:
        # img2text is given a file path, so write the upload to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
            temp_file.write(image_bytes)
            temp_file_path = temp_file.name
        try:
            scenario = img2text(temp_file_path)
        finally:
            # Remove the temp file even if captioning raised or Streamlit
            # interrupted the run; report (but do not raise) cleanup errors.
            try:
                os.remove(temp_file_path)
            except OSError as e:
                st.error(f"删除临时文件时出错: {e}")
        if scenario:
            results["scenario"] = scenario
    scenario = results.get("scenario")

    if scenario:
        st.write(scenario)

        # Stage 2: Text to Story
        st.text('Generating a story...')
        if "story" not in results:
            story = text2story(scenario)
            if story:
                results["story"] = story
        story = results.get("story")

        if story:
            st.write(story)

            # Stage 3: Story to Audio data
            st.text('Generating audio data...')
            if "audio" not in results:
                audio_buffer = text2audio(story)
                if audio_buffer:
                    # Store plain bytes so later reruns do not depend on the
                    # read position of a shared BytesIO object.
                    results["audio"] = audio_buffer.getvalue()
            audio_data = results.get("audio")

            if audio_data is not None:
                # Play button
                if st.button("Play Audio"):
                    st.audio(audio_data,
                             format="audio/mpeg",
                             start_time=0)