import streamlit as st
import os
import json
import shutil
import re
import requests
import pyttsx3
from pydub import AudioSegment
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Streamlit configuration
st.set_page_config(page_title="Podcast Generator", layout="wide")
st.title("🎙️ Podcast Generator")

# System prompt for conversation generation
system_prompt = """You are an experienced podcast host...
- Based on text like an article, you can create an engaging conversation between two people.
- Make the conversation engaging with a lot of emotion.
- In the response, identify the speakers as Sascha and Marina.
- Sascha is the writer, and Marina is the one asking questions.
- The podcast is called The Machine Learning Engineer.
- Use short sentences that can be easily used with speech synthesis.
- Use natural conversation fillers like "äh" to make it sound real.
"""

# Load Hugging Face's distilgpt2 model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Pyttsx3 setup
engine = pyttsx3.init()
engine.setProperty("rate", 150)  # Adjust speech rate as needed
engine.setProperty("voice", "english")  # Voice IDs are platform-dependent; "english" works with the eSpeak driver

# Retrieve ElevenLabs API key from environment
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
elevenlabs_url = "https://api.elevenlabs.io/v1/text-to-speech/ERL3svWBAQ18ByCZTr4k"
elevenlabs_headers = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": elevenlabs_api_key
}


# ElevenLabs TTS function for Sascha
def synthesize_speech_elevenlabs(text, speaker, index):
    data = {
        "text": text,
        "model_id": "eleven_turbo_v2_5",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    }
    response = requests.post(elevenlabs_url, json=data, headers=elevenlabs_headers, stream=True)
    response.raise_for_status()  # Surface API errors (bad key, quota) instead of writing an empty file
    filename = f"audio-files/{index}_{speaker}.mp3"
    with open(filename, "wb") as out:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)


# Pyttsx3 TTS function for Marina
def synthesize_speech_pyttsx3(text, speaker, index):
    # pyttsx3 writes uncompressed audio regardless of the file extension, so save as .wav;
    # merge_audios() picks up both .mp3 and .wav files.
    filename = f"audio-files/{index}_{speaker}.wav"
    engine.save_to_file(text, filename)
    engine.runAndWait()


# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    if speaker == "Sascha":
        synthesize_speech_elevenlabs(text, speaker, index)
    else:
        synthesize_speech_pyttsx3(text, speaker, index)


# Function to sort filenames naturally (so 10_... sorts after 9_..., not after 1_...)
def natural_sort_key(filename):
    return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]


# Function to merge audio files
def merge_audios(audio_folder, output_file):
    combined = AudioSegment.empty()
    audio_files = sorted(
        [f for f in os.listdir(audio_folder) if f.endswith(".mp3") or f.endswith(".wav")],
        key=natural_sort_key
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        audio = AudioSegment.from_file(audio_path)
        combined += audio
    combined.export(output_file, format="mp3")


# Function to generate the conversation using distilgpt2
def generate_conversation(article):
    prompt = system_prompt + "\n\nArticle:\n" + article + "\n\nSascha: "
    # distilgpt2 has a 1,024-token context window, so truncate the prompt and cap the
    # number of newly generated tokens instead of requesting an oversized max_length.
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=768)
    output = model.generate(
        input_ids,
        max_new_tokens=256,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the newly generated tokens (skip the echoed prompt)
    conversation_text = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    lines = conversation_text.splitlines()
    # Build a structured conversation by alternating speakers line by line
    conversation = []
    speaker = "Sascha"
    for line in lines:
        if line.strip():
            conversation.append({"speaker": speaker, "text": line.strip()})
            speaker = "Marina" if speaker == "Sascha" else "Sascha"
    return conversation


# Function to generate the podcast audio from conversation data
def generate_audio(conversation):
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)
    for index, part in enumerate(conversation):
        speaker = part['speaker']
        text = part['text']
        synthesize_speech(text, speaker, index)
    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file


# Streamlit inputs and outputs
article = st.text_area("Article Content", placeholder="Paste the article text here", height=300)

if st.button("Generate Podcast"):
    if not article.strip():
        st.error("Please enter article content to generate a podcast.")
    else:
        with st.spinner("Generating conversation..."):
            conversation = generate_conversation(article)
        st.success("Conversation generated successfully!")
        st.json(conversation)

        # Generate audio files
        with st.spinner("Synthesizing audio..."):
            podcast_file = generate_audio(conversation)
        st.success("Audio synthesis complete!")
        st.audio(podcast_file, format="audio/mp3")
        with open(podcast_file, "rb") as file:
            st.download_button("Download Podcast", data=file, file_name="podcast.mp3", mime="audio/mp3")
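
# Usage note (a minimal sketch; the file name app.py is an assumption, not from the original script).
# The ElevenLabs key is read from a local .env file, and the app is launched with the Streamlit CLI:
#
#   # .env
#   ELEVENLABS_API_KEY=<your ElevenLabs API key>
#
#   streamlit run app.py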