# import streamlit as st
# from transformers import SeamlessM4Tv2Model, AutoProcessor
# import torch
# import numpy as np
# from scipy.io.wavfile import write
# import re
# from io import BytesIO

# # Load the processor and model
# processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
# model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Number to words function for Uzbek
# number_words = {
#     0: "nol", 1: "bir", 2: "ikki", 3: "uch", 4: "to'rt", 5: "besh", 6: "olti", 7: "yetti", 8: "sakkiz", 9: "to'qqiz",
#     10: "o'n", 11: "o'n bir", 12: "o'n ikki", 13: "o'n uch", 14: "o'n to'rt", 15: "o'n besh", 16: "o'n olti", 17: "o'n yetti",
#     18: "o'n sakkiz", 19: "o'n to'qqiz", 20: "yigirma", 30: "o'ttiz", 40: "qirq", 50: "ellik", 60: "oltmish", 70: "yetmish",
#     80: "sakson", 90: "to'qson", 100: "yuz", 1000: "ming", 1000000: "million"
# }

# def number_to_words(number):
#     if number < 20:
#         return number_words[number]
#     elif number < 100:
#         tens, unit = divmod(number, 10)
#         return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
#     elif number < 1000:
#         hundreds, remainder = divmod(number, 100)
#         return (number_words[hundreds] + " yuz" if hundreds > 1 else "yuz") + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000:
#         thousands, remainder = divmod(number, 1000)
#         return (number_to_words(thousands) + " ming" if thousands > 1 else "ming") + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000000:
#         millions, remainder = divmod(number, 1000000)
#         return number_to_words(millions) + " million" + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000000000:
#         billions, remainder = divmod(number, 1000000000)
#         return number_to_words(billions) + " milliard" + (" " + number_to_words(remainder) if remainder else "")
#     else:
#         return str(number)

# def replace_numbers_with_words(text):
#     def replace(match):
#         number = int(match.group())
#         return number_to_words(number)
#     result = re.sub(r'\b\d+\b', replace, text)
#     return result

# # Replacements
# replacements = [
#     ("bo‘ladi", "bo'ladi"),
#     ("yog‘ingarchilik", "yog'ingarchilik"),
# ]

# def cleanup_text(text):
#     for src, dst in replacements:
#         text = text.replace(src, dst)
#     return text

# # Streamlit App
# st.title("Text-to-Speech using Seamless M4T Model")

# # User Input
# user_input = st.text_area("Enter the text for speech generation", height=200)

# # Process the text and generate speech
# if st.button("Generate Speech"):
#     if user_input.strip():
#         # Apply text transformations
#         converted_text = replace_numbers_with_words(user_input)
#         cleaned_text = cleanup_text(converted_text)

#         # Process input for model
#         inputs = processor(text=cleaned_text, src_lang="uzn", return_tensors="pt").to(device)

#         # Generate audio from text
#         audio_array_from_text = model.generate(**inputs, tgt_lang="uzn")[0].cpu().numpy().squeeze()

#         # Save to BytesIO
#         audio_io = BytesIO()
#         write(audio_io, 16000, audio_array_from_text.astype(np.float32))
#         audio_io.seek(0)

#         # Provide audio for playback
#         st.audio(audio_io, format='audio/wav')
#     else:
#         st.warning("Please enter some text to generate speech.")


import streamlit as st
from transformers import AutoProcessor, SeamlessM4Tv2Model
import torch
import numpy as np
from scipy.io.wavfile import write
from io import BytesIO
# Load the processor and model (Seamless M4T v2 generates speech from text directly)
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

# Alternative checkpoint: a SpeechT5 model fine-tuned for Uzbek. It cannot be driven with
# model.generate(..., tgt_lang=...); see the sketch at the end of this file.
# processor = AutoProcessor.from_pretrained("Beehzod/speecht5_finetuned_uz_customData")
# model = AutoModelForTextToSpectrogram.from_pretrained("Beehzod/speecht5_finetuned_uz_customData")

# Set the device (CUDA if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Streamlit title
st.title("Text-to-Speech with Seamless M4T Model")

# Input text field
text = st.text_area("Enter text for audio generation")

# Button to generate audio
if st.button("Generate Audio"):
    if text.strip():
        # Preprocess the text and move the tensors to the model's device
        inputs = processor(text=text, src_lang="uzn", return_tensors="pt").to(device)

        # Generate the waveform from the model
        audio_array_from_text = model.generate(**inputs, tgt_lang="uzn")[0].cpu().numpy().squeeze()

        # Save the audio as a .wav file in memory (Seamless M4T v2 outputs 16 kHz audio)
        audio_file = BytesIO()
        write(audio_file, 16000, audio_array_from_text.astype(np.float32))
        audio_file.seek(0)  # Reset the pointer to the start of the buffer

        # Display the audio player in the Streamlit app
        st.audio(audio_file, format="audio/wav")
    else:
        st.warning("Please enter text to generate audio.")
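
# Optional: using the fine-tuned SpeechT5 checkpoint instead.
# A minimal sketch of the generation path the commented-out
# "Beehzod/speecht5_finetuned_uz_customData" loader would need. SpeechT5 does not accept
# src_lang/tgt_lang and does not emit a waveform directly: it predicts a spectrogram and
# requires speaker embeddings plus a vocoder. The vocoder ("microsoft/speecht5_hifigan")
# and the x-vector dataset below are the standard examples from the SpeechT5 docs, not
# something this app ships with; treat them as assumptions.
#
# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# from datasets import load_dataset
#
# t5_processor = SpeechT5Processor.from_pretrained("Beehzod/speecht5_finetuned_uz_customData")
# t5_model = SpeechT5ForTextToSpeech.from_pretrained("Beehzod/speecht5_finetuned_uz_customData").to(device)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
#
# # Any 512-dim x-vector works as a speaker embedding; this one comes from the example dataset.
# embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0).to(device)
#
# t5_inputs = t5_processor(text=text, return_tensors="pt").to(device)
# speech = t5_model.generate_speech(t5_inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
# audio = speech.cpu().numpy()  # also 16 kHz, so the same scipy write() call applies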