github-actions[bot] committed on
Commit
4657892
·
1 Parent(s): 623b3b9

Sync with https://github.com/mozilla-ai/document-to-podcast

Browse files
Files changed (1) hide show
  1. app.py +15 -28
app.py CHANGED
@@ -1,7 +1,6 @@
1
  """Streamlit app for converting documents to podcasts."""
2
 
3
  import io
4
- import os
5
  import re
6
  from pathlib import Path
7
 
@@ -23,16 +22,13 @@ from document_to_podcast.utils import stack_audio_segments
23
  @st.cache_resource
24
  def load_text_to_text_model():
25
  return load_llama_cpp_model(
26
- model_id="bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf"
27
  )
28
 
29
 
30
  @st.cache_resource
31
- def load_text_to_speech_model():
32
- if os.environ.get("HF_SPACE") == "TRUE":
33
- return load_tts_model("hexgrad/kLegacy/v0.19/kokoro-v0_19.pth")
34
- else:
35
- return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
36
 
37
 
38
  def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
@@ -115,29 +111,11 @@ if "clean_text" in st.session_state:
115
  "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
116
  )
117
  st.divider()
 
 
 
118
 
119
  text_model = load_text_to_text_model()
120
- speech_model = load_text_to_speech_model()
121
-
122
- if os.environ.get("HF_SPACE") == "TRUE":
123
- tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
124
- SPEAKERS = [
125
- {
126
- "id": 1,
127
- "name": "Sarah",
128
- "description": "The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.",
129
- "voice_profile": "af_sarah",
130
- },
131
- {
132
- "id": 2,
133
- "name": "Michael",
134
- "description": "The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.",
135
- "voice_profile": "am_michael",
136
- },
137
- ]
138
- else:
139
- tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
140
- SPEAKERS = DEFAULT_SPEAKERS
141
 
142
  st.markdown(
143
  "For this demo, we are using the following models: \n"
@@ -180,6 +158,15 @@ if "clean_text" in st.session_state:
180
  speaker.get(x, None) for x in ["name", "description", "voice_profile"]
181
  )
182
  )
 
 
 
 
 
 
 
 
 
183
  system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)
184
  with st.spinner("Generating Podcast..."):
185
  text = ""
 
1
  """Streamlit app for converting documents to podcasts."""
2
 
3
  import io
 
4
  import re
5
  from pathlib import Path
6
 
 
22
  @st.cache_resource
23
  def load_text_to_text_model():
24
  return load_llama_cpp_model(
25
+ model_id="bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
26
  )
27
 
28
 
29
  @st.cache_resource
30
+ def load_text_to_speech_model(lang_code: str):
31
+ return load_tts_model("hexgrad/Kokoro-82M", **{"lang_code": lang_code})
 
 
 
32
 
33
 
34
  def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
 
111
  "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
112
  )
113
  st.divider()
114
+ tts_link = "- [hexgrad/Kokoro-82M](https://github.com/hexgrad/kokoro)"
115
+
116
+ SPEAKERS = DEFAULT_SPEAKERS
117
 
118
  text_model = load_text_to_text_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  st.markdown(
121
  "For this demo, we are using the following models: \n"
 
158
  speaker.get(x, None) for x in ["name", "description", "voice_profile"]
159
  )
160
  )
161
+ if speakers[0]["voice_profile"][0] != speakers[1]["voice_profile"][0]:
162
+ raise ValueError(
163
+ "Both Kokoro speakers need to have the same language code. "
164
+ "More info here https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md"
165
+ )
166
+ # Get which language is used for generation from the first character of the Kokoro voice profile
167
+ language_code = speakers[0]["voice_profile"][0]
168
+ speech_model = load_text_to_speech_model(lang_code=language_code)
169
+
170
  system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)
171
  with st.spinner("Generating Podcast..."):
172
  text = ""