# Spaces: Running
import asyncio
import contextlib
import json
import mimetypes
import os
import time
import uuid
from typing import List, Dict

import aiofiles
import edge_tts
import gradio as gr
from google import genai
from google.genai import types
from pydub import AudioSegment
# Constants
# Largest upload accepted for script generation, expressed in megabytes.
MAX_FILE_SIZE_MB = 20
# The same limit converted to bytes, for comparison against on-disk file sizes.
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
class PodcastGenerator:
    """Generate a two-speaker podcast audio file from text and/or a document.

    The pipeline has three stages: Gemini writes a JSON script
    (``generate_script``), each script line is synthesized with edge-tts
    (``tts_generate``), and the clips are concatenated with pydub
    (``combine_audio_files``). ``generate_podcast`` is the public entry point.
    """

    # Few-shot example embedded in the system prompt to pin the JSON shape.
    _EXAMPLE_SCRIPT = """
{
    "topic": "AGI",
    "podcast": [
        {
            "speaker": 2,
            "line": "So, AGI, huh? Seems like everyone's talking about it these days."
        },
        {
            "speaker": 1,
            "line": "Yeah, it's definitely having a moment, isn't it?"
        },
        {
            "speaker": 2,
            "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"
        },
        {
            "speaker": 1,
            "line": "Honestly, it's the sheer scale of what AGI could do. We're talking about potentially reshaping well everything."
        },
        {
            "speaker": 2,
            "line": "No kidding, but let's be real. Sometimes it feels like every other headline is either hyping AGI up as this technological utopia or painting it as our inevitable robot overlords."
        },
        {
            "speaker": 1,
            "line": "It's easy to get lost in the noise, for sure."
        },
        {
            "speaker": 2,
            "line": "Exactly. So how about we try to cut through some of that, shall we?"
        },
        {
            "speaker": 1,
            "line": "Sounds like a plan."
        },
        {
            "speaker": 2,
            "line": "Okay, so first things first, AGI, what is it really? And I don't just mean some dictionary definition, we're talking about something way bigger than just a super smart computer, right?"
        },
        {
            "speaker": 1,
            "line": "Right, it's not just about more processing power or better algorithms, it's about a fundamental shift in how we think about intelligence itself."
        },
        {
            "speaker": 2,
            "line": "So like, instead of programming a machine for a specific task, we're talking about creating something that can learn and adapt like we do."
        },
        {
            "speaker": 1,
            "line": "Exactly, think of it this way: Right now, we've got AI that can beat a grandmaster at chess but ask that same AI to, say, write a poem or compose a symphony. No chance."
        },
        {
            "speaker": 2,
            "line": "Okay, I see. So, AGI is about bridging that gap, creating something that can move between those different realms of knowledge seamlessly."
        },
        {
            "speaker": 1,
            "line": "Precisely. It's about replicating that uniquely human ability to learn something new and apply that knowledge in completely different contexts and that's a tall order, let me tell you."
        },
        {
            "speaker": 2,
            "line": "I bet. I mean, think about how much we still don't even understand about our own brains."
        },
        {
            "speaker": 1,
            "line": "That's exactly it. We're essentially trying to reverse-engineer something we don't fully comprehend."
        },
        {
            "speaker": 2,
            "line": "And how are researchers even approaching that? What are some of the big ideas out there?"
        },
        {
            "speaker": 1,
            "line": "Well, there are a few different schools of thought. One is this idea of neuromorphic computing where they're literally trying to build computer chips that mimic the structure and function of the human brain."
        },
        {
            "speaker": 2,
            "line": "Wow, so like actually replicating the physical architecture of the brain. That's wild."
        },
        {
            "speaker": 1,
            "line": "It's pretty mind-blowing stuff and then you've got folks working on something called whole brain emulation."
        },
        {
            "speaker": 2,
            "line": "Okay, and what's that all about?"
        },
        {
            "speaker": 1,
            "line": "The basic idea there is to create a complete digital copy of a human brain down to the last neuron and synapse and run it on a sufficiently powerful computer simulation."
        },
        {
            "speaker": 2,
            "line": "Hold on, a digital copy of an entire brain, that sounds like something straight out of science fiction."
        },
        {
            "speaker": 1,
            "line": "It does, doesn't it? But it gives you an idea of the kind of ambition we're talking about here and the truth is we're still a long way off from truly achieving AGI, no matter which approach you look at."
        },
        {
            "speaker": 2,
            "line": "That makes sense but it's still exciting to think about the possibilities, even if they're a ways off."
        },
        {
            "speaker": 1,
            "line": "Absolutely and those possibilities are what really get people fired up about AGI, right? Yeah."
        },
        {
            "speaker": 2,
            "line": "For sure. In fact, I remember you mentioning something in that podcast about AGI's potential to revolutionize scientific research. Something about supercharging breakthroughs."
        },
        {
            "speaker": 1,
            "line": "Oh, absolutely. Imagine an AI that doesn't just crunch numbers but actually understands scientific data the way a human researcher does. We're talking about potential breakthroughs in everything from medicine and healthcare to material science and climate change."
        },
        {
            "speaker": 2,
            "line": "It's like giving scientists this incredibly powerful new tool to tackle some of the biggest challenges we face."
        },
        {
            "speaker": 1,
            "line": "Exactly, it could be a total game changer."
        },
        {
            "speaker": 2,
            "line": "Okay, but let's be real, every coin has two sides. What about the potential downsides of AGI? Because it can't all be sunshine and roses, right?"
        },
        {
            "speaker": 1,
            "line": "Right, there are definitely valid concerns. Probably the biggest one is the impact on the job market. As AGI gets more sophisticated, there's a real chance it could automate a lot of jobs that are currently done by humans."
        },
        {
            "speaker": 2,
            "line": "So we're not just talking about robots taking over factories but potentially things like, what, legal work, analysis, even creative fields?"
        },
        {
            "speaker": 1,
            "line": "Potentially, yes. And that raises a whole host of questions about what happens to those workers, how we retrain them, how we ensure that the benefits of AGI are shared equitably."
        },
        {
            "speaker": 2,
            "line": "Right, because it's not just about the technology itself, but how we choose to integrate it into society."
        },
        {
            "speaker": 1,
            "line": "Absolutely. We need to be having these conversations now about ethics, about regulation, about how to make sure AGI is developed and deployed responsibly."
        },
        {
            "speaker": 2,
            "line": "So it's less about preventing some kind of sci-fi robot apocalypse and more about making sure we're steering this technology in the right direction from the get-go."
        },
        {
            "speaker": 1,
            "line": "Exactly, AGI has the potential to be incredibly beneficial, but it's not going to magically solve all our problems. It's on us to make sure we're using it for good."
        },
        {
            "speaker": 2,
            "line": "It's like you said earlier, it's about shaping the future of intelligence."
        },
        {
            "speaker": 1,
            "line": "I like that. It really is."
        },
        {
            "speaker": 2,
            "line": "And honestly, that's a responsibility that extends beyond just the researchers and the policymakers."
        },
        {
            "speaker": 1,
            "line": "100%"
        },
        {
            "speaker": 2,
            "line": "So to everyone listening out there I'll leave you with this. As AGI continues to develop, what role do you want to play in shaping its future?"
        },
        {
            "speaker": 1,
            "line": "That's a question worth pondering."
        },
        {
            "speaker": 2,
            "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."
        },
        {
            "speaker": 1,
            "line": "Peace."
        }
    ]
}
"""

    def __init__(self):
        pass

    async def generate_script(self, prompt: str, language: str, api_key: str, file_obj=None, progress=None) -> Dict:
        """Ask Gemini for a podcast script and return it as a parsed dict.

        Args:
            prompt: Free-form user text; may be empty when a file is supplied.
            language: Target language name, or "Auto Detect" to follow the input.
            api_key: Gemini API key used to build the client.
            file_obj: Optional upload (path string or object exposing ``.name``).
            progress: Optional callable ``progress(fraction, message)``.

        Returns:
            A dict with a non-empty ``"podcast"`` list of
            ``{"speaker": ..., "line": ...}`` entries.

        Raises:
            Exception: On timeout, API failure, oversized upload, or when the
                model's reply is empty, not valid JSON, or lacks script lines.
        """
        if language == "Auto Detect":
            language_instruction = "- The podcast MUST be in the same language as the user input."
        else:
            language_instruction = f"- The podcast MUST be in {language} language"

        system_prompt = f"""
You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
{language_instruction}
- The podcast should have 2 speakers.
- The podcast should be long.
- Do not use names for the speakers.
- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
- The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
- The script must be in JSON format.
Follow this example structure:
{self._EXAMPLE_SCRIPT}
"""

        if prompt and file_obj:
            user_prompt = f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
        elif prompt:
            user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
        else:
            user_prompt = "Please generate a podcast script based on the uploaded file."

        messages = []
        # If a file is provided, send its bytes ahead of the text prompt.
        if file_obj:
            file_data = await self._read_file_bytes(file_obj)
            mime_type = self._get_mime_type(self._file_path(file_obj))
            messages.append(
                types.Content(
                    role="user",
                    parts=[types.Part.from_bytes(data=file_data, mime_type=mime_type)],
                )
            )
        messages.append(
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=user_prompt)],
            )
        )

        client = genai.Client(api_key=api_key)
        # Every harm category is explicitly set to BLOCK_NONE.
        harm_categories = (
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HATE_SPEECH",
        )

        try:
            if progress:
                progress(0.3, "Generating podcast script...")
            # Bound the API call so a stalled request cannot hang the pipeline.
            response = await asyncio.wait_for(
                client.aio.models.generate_content(
                    model="gemini-2.0-flash",
                    contents=messages,
                    config=types.GenerateContentConfig(
                        temperature=1,
                        response_mime_type="application/json",
                        safety_settings=[
                            types.SafetySetting(category=category, threshold="BLOCK_NONE")
                            for category in harm_categories
                        ],
                        system_instruction=system_prompt,
                    ),
                ),
                timeout=60,  # 60 seconds timeout
            )
        except asyncio.TimeoutError as exc:
            raise Exception("The script generation request timed out. Please try again later.") from exc
        except Exception as exc:
            detail = str(exc)
            if "API key not valid" in detail:
                raise Exception("Invalid API key. Please provide a valid Gemini API key.") from exc
            if "rate limit" in detail.lower():
                raise Exception(
                    "Rate limit exceeded for the API key. Please try again later or provide your own Gemini API key."
                ) from exc
            raise Exception(f"Failed to generate podcast script: {exc}") from exc

        print(f"Generated podcast script:\n{response.text}")
        script = self._parse_script(response.text)
        if progress:
            progress(0.4, "Script generated successfully!")
        return script

    @staticmethod
    def _parse_script(raw_text) -> Dict:
        """Parse and validate the model's JSON reply.

        Raises:
            Exception: If the reply is empty, is not valid JSON, or does not
                hold a non-empty ``"podcast"`` list of speaker/line entries.
        """
        if not raw_text:
            # The reply text can be missing entirely, e.g. when nothing usable came back.
            raise Exception("The model returned an empty response. Please try again or adjust your input.")
        try:
            script = json.loads(raw_text)
        except json.JSONDecodeError as exc:
            raise Exception("The model returned a script that is not valid JSON. Please try again.") from exc

        entries = script.get("podcast") if isinstance(script, dict) else None
        if not isinstance(entries, list) or not entries:
            raise Exception("The generated script does not contain any podcast lines. Please try again.")
        for entry in entries:
            if not isinstance(entry, dict) or "speaker" not in entry or "line" not in entry:
                raise Exception("The generated script has a malformed podcast entry. Please try again.")
        return script

    @staticmethod
    def _file_path(file_obj) -> str:
        """Return the filesystem path of an upload given as a path or a ``.name`` holder."""
        if isinstance(file_obj, (str, os.PathLike)):
            return os.fspath(file_obj)
        return file_obj.name

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete ``path`` if possible; a missing or locked file is not an error here."""
        with contextlib.suppress(OSError):
            os.remove(path)

    async def _read_file_bytes(self, file_obj) -> bytes:
        """Read an upload's content after enforcing the size limit.

        Raises:
            Exception: If the file is larger than ``MAX_FILE_SIZE_MB``.
        """
        # Check the size before reading so an oversized file is never loaded.
        if hasattr(file_obj, 'size'):
            file_size = file_obj.size
        else:
            file_size = os.path.getsize(self._file_path(file_obj))
        if file_size > MAX_FILE_SIZE_BYTES:
            raise Exception(f"File size exceeds the {MAX_FILE_SIZE_MB}MB limit. Please upload a smaller file.")

        if hasattr(file_obj, 'read'):
            return file_obj.read()
        async with aiofiles.open(self._file_path(file_obj), 'rb') as f:
            return await f.read()

    def _get_mime_type(self, filename: str) -> str:
        """Determine the MIME type from the file extension.

        Falls back to ``mimetypes`` and finally to ``application/octet-stream``.
        """
        ext = os.path.splitext(filename)[1].lower()
        if ext == '.pdf':
            return "application/pdf"
        if ext == '.txt':
            return "text/plain"
        mime_type, _ = mimetypes.guess_type(filename)
        return mime_type or "application/octet-stream"

    async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
        """Synthesize one script line and return the path of the temporary clip.

        Args:
            text: The line to speak.
            speaker: Speaker id from the script; ``1`` (or ``"1"``) selects
                ``speaker1``, anything else selects ``speaker2``.
            speaker1: edge-tts voice name for speaker 1.
            speaker2: edge-tts voice name for speaker 2.

        Raises:
            Exception: If synthesis exceeds 30 seconds; other edge-tts errors
                propagate unchanged. The temporary file is removed on failure.
        """
        # Accept the id as a string too, since the script comes from model-written JSON.
        is_first_speaker = speaker == 1 or str(speaker).strip() == "1"
        voice = speaker1 if is_first_speaker else speaker2
        speech = edge_tts.Communicate(text, voice)
        # edge-tts produces MP3 audio by default, so name the clip accordingly.
        temp_filename = f"temp_{uuid.uuid4()}.mp3"
        try:
            await asyncio.wait_for(speech.save(temp_filename), timeout=30)  # 30 seconds timeout
        except asyncio.TimeoutError as exc:
            self._remove_quietly(temp_filename)
            raise Exception("Text-to-speech generation timed out. Please try with a shorter text.") from exc
        except BaseException:
            # Includes cancellation: never leave a partial clip behind.
            self._remove_quietly(temp_filename)
            raise
        return temp_filename

    async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
        """Concatenate clips in order into one WAV file and return its path.

        The input clips are deleted afterwards, whether or not combining succeeds.
        """
        if progress:
            progress(0.9, "Combining audio files...")
        try:
            combined_audio = AudioSegment.empty()
            for audio_file in audio_files:
                combined_audio += AudioSegment.from_file(audio_file)
            output_filename = f"output_{uuid.uuid4()}.wav"
            combined_audio.export(output_filename, format="wav")
        finally:
            # Clean up temporary clips even if loading or exporting failed midway.
            for audio_file in audio_files:
                self._remove_quietly(audio_file)
        if progress:
            progress(1.0, "Podcast generated successfully!")
        return output_filename

    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
        """Run the whole pipeline under a 10-minute limit and return the WAV path.

        Raises:
            Exception: On overall timeout, or wrapping any pipeline failure
                with an "Error generating podcast" prefix.
        """
        try:
            if progress:
                progress(0.1, "Starting podcast generation...")
            return await asyncio.wait_for(
                self._generate_podcast_internal(input_text, language, speaker1, speaker2, api_key, file_obj, progress),
                timeout=600,  # 10 minutes total timeout
            )
        except asyncio.TimeoutError as exc:
            raise Exception(
                "The podcast generation process timed out. Please try with shorter text or try again later."
            ) from exc
        except Exception as exc:
            raise Exception(f"Error generating podcast: {str(exc)}") from exc

    async def _generate_podcast_internal(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
        """Generate the script, synthesize it in concurrent batches, and combine.

        Raises:
            Exception: If the script has no speakable lines or any line fails
                to synthesize; clips already written are removed first.
        """
        if progress:
            progress(0.2, "Generating podcast script...")
        podcast_json = await self.generate_script(input_text, language, api_key, file_obj, progress)
        if progress:
            progress(0.5, "Converting text to speech...")

        # Blank lines carry nothing to synthesize, so drop them up front.
        script_lines = [item for item in podcast_json['podcast'] if str(item['line']).strip()]
        total_lines = len(script_lines)
        if total_lines == 0:
            raise Exception("The generated script contains no spoken lines. Please try again.")

        batch_size = 10  # Caps concurrent TTS requests; adjust to system resources.
        audio_files = []
        try:
            for batch_start in range(0, total_lines, batch_size):
                batch = script_lines[batch_start:batch_start + batch_size]
                batch_end = batch_start + len(batch)
                batch_results = await asyncio.gather(
                    *(self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in batch),
                    return_exceptions=True,
                )
                failure = next((res for res in batch_results if isinstance(res, BaseException)), None)
                if failure is not None:
                    # Clips from this batch that did succeed are not tracked yet; remove them here.
                    for res in batch_results:
                        if isinstance(res, str):
                            self._remove_quietly(res)
                    if not isinstance(failure, Exception):
                        raise failure  # e.g. cancellation: propagate untouched
                    raise Exception(f"Error generating speech: {str(failure)}") from failure
                audio_files.extend(batch_results)
                if progress:
                    current_progress = 0.5 + (0.4 * (batch_end / total_lines))
                    progress(current_progress, f"Processed {batch_end}/{total_lines} speech segments...")
        except BaseException:
            # Any failure or cancellation: discard clips from earlier batches.
            for path in audio_files:
                self._remove_quietly(path)
            raise

        return await self.combine_audio_files(audio_files, progress)
async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "", progress=None) -> str:
    """Validate the UI inputs, run the podcast pipeline, and return the audio path.

    Args:
        input_text: Text typed by the user; may be empty if a file is given.
        input_file: Optional uploaded file, passed through to the generator.
        language: Target language name or "Auto Detect".
        speaker1: Display name of the voice for speaker 1.
        speaker2: Display name of the voice for speaker 2.
        api_key: Gemini API key; falls back to the ``GENAI_API_KEY`` env var.
        progress: Optional callable ``progress(fraction, message)``.

    Returns:
        Path of the generated podcast audio file.

    Raises:
        Exception: With a user-facing message for any validation or
            generation failure (rate limit, timeout, or a generic error).
    """
    start_time = time.time()
    # Display label shown in the UI -> edge-tts voice identifier.
    voice_names = {
        "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
        "Ava - English (United States)": "en-US-AvaMultilingualNeural",
        "Brian - English (United States)": "en-US-BrianMultilingualNeural",
        "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
        "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
        "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
        "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
        "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural",
    }
    try:
        if progress:
            progress(0.05, "Processing input...")

        unknown_voices = [str(label) for label in (speaker1, speaker2) if label not in voice_names]
        if unknown_voices:
            raise Exception(f"Unknown voice selection: {', '.join(unknown_voices)}")
        speaker1 = voice_names[speaker1]
        speaker2 = voice_names[speaker2]

        # Without text or a file there is nothing to base a script on.
        if not (input_text and input_text.strip()) and input_file is None:
            raise Exception("Please provide some input text or upload a file.")

        api_key = (api_key or "").strip() or os.getenv("GENAI_API_KEY")
        if not api_key:
            raise Exception("No API key provided. Please provide a Gemini API key.")

        podcast_generator = PodcastGenerator()
        podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key, input_file, progress)
        end_time = time.time()
        print(f"Total podcast generation time: {end_time - start_time:.2f} seconds")
        return podcast
    except Exception as exc:
        # Ensure we show a user-friendly error.
        error_msg = str(exc)
        lowered = error_msg.lower()
        if "rate limit" in lowered:
            raise Exception("Rate limit exceeded. Please try again later or use your own API key.") from exc
        # The pipeline's own messages say "timed out", so match both spellings.
        if "timed out" in lowered or "timeout" in lowered:
            raise Exception(
                "The request timed out. This could be due to server load or the length of your input. Please try again with shorter text."
            ) from exc
        raise Exception(f"Error: {error_msg}") from exc
# Gradio UI
def generate_podcast_gradio(input_text, input_file, language, speaker1, speaker2, api_key, progress=gr.Progress()):
    """Synchronous Gradio click handler that runs the async pipeline.

    Args:
        input_text: Contents of the text box.
        input_file: Uploaded file from the file component, or ``None``.
        language: Selected language option.
        speaker1: Selected voice label for speaker 1.
        speaker2: Selected voice label for speaker 2.
        api_key: Optional user-supplied Gemini API key.
        progress: Gradio progress tracker, supplied through the default argument.

    Returns:
        Path of the generated audio file for the output component.

    Raises:
        gr.Error: Carrying the failure message so it is shown in the UI.
    """
    # Adapt Gradio's tracker to the plain (fraction, message) callback the pipeline expects.
    def progress_callback(value, text):
        progress(value, text)

    try:
        # Run the coroutine to completion on a fresh event loop.
        return asyncio.run(process_input(
            input_text,
            input_file,
            language,
            speaker1,
            speaker2,
            api_key,
            progress_callback,
        ))
    except Exception as exc:
        raise gr.Error(str(exc)) from exc
def main():
    """Build the Gradio Blocks interface and launch the app."""
    # Languages offered for the generated script; "Auto Detect" follows the input.
    language_options = [
        "Auto Detect",
        "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani",
        "Bahasa Indonesian", "Bangla", "Basque", "Bengali", "Bosnian", "Bulgarian",
        "Burmese", "Catalan", "Chinese Cantonese", "Chinese Mandarin",
        "Chinese Taiwanese", "Croatian", "Czech", "Danish", "Dutch", "English",
        "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian",
        "German", "Greek", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Irish",
        "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean",
        "Lao", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam",
        "Maltese", "Mongolian", "Nepali", "Norwegian Bokmål", "Pashto", "Persian",
        "Polish", "Portuguese", "Romanian", "Russian", "Serbian", "Sinhala",
        "Slovak", "Slovene", "Somali", "Spanish", "Sundanese", "Swahili",
        "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian", "Urdu",
        "Uzbek", "Vietnamese", "Welsh", "Zulu",
    ]
    # Voice labels shown in the dropdowns; these must match the keys that
    # process_input maps to edge-tts voice identifiers.
    voice_options = [
        "Andrew - English (United States)",
        "Ava - English (United States)",
        "Brian - English (United States)",
        "Emma - English (United States)",
        "Florian - German (Germany)",
        "Seraphina - German (Germany)",
        "Remy - French (France)",
        "Vivienne - French (France)",
    ]

    with gr.Blocks(title="PodcastGen 🎙️") as demo:
        gr.Markdown("# PodcastGen 🎙️")
        gr.Markdown("Generate a 2-speaker podcast from text input or documents!")

        # Inputs: free text on the left, optional document upload on the right.
        with gr.Row():
            with gr.Column(scale=2):
                input_text = gr.Textbox(label="Input Text", lines=10, placeholder="Enter text for podcast generation...")
            with gr.Column(scale=1):
                input_file = gr.File(label="Or Upload a PDF or TXT file", file_types=[".pdf", ".txt"])

        # Settings: API key and language, then one voice per speaker.
        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(label="Your Gemini API Key (Optional)", placeholder="Enter API key here if you're getting rate limited", type="password")
                language = gr.Dropdown(label="Language", choices=language_options, value="Auto Detect")
            with gr.Column():
                speaker1 = gr.Dropdown(label="Speaker 1 Voice", choices=voice_options, value="Andrew - English (United States)")
                speaker2 = gr.Dropdown(label="Speaker 2 Voice", choices=voice_options, value="Ava - English (United States)")

        generate_btn = gr.Button("Generate Podcast", variant="primary")

        with gr.Row():
            output_audio = gr.Audio(label="Generated Podcast", type="filepath", format="wav")

        generate_btn.click(
            fn=generate_podcast_gradio,
            inputs=[input_text, input_file, language, speaker1, speaker2, api_key],
            outputs=[output_audio],
        )

    demo.launch()
# Launch the UI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()