Spaces:

mustafoyev202
/

uzbek_stt

Sleeping

App Files Files Community

mustafoyev202 committed on Apr 6

Commit

e210e52

verified ·

1 Parent(s): 328d5e6

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -34

app.py CHANGED Viewed

@@ -2,16 +2,15 @@ import os
 import torch
 import logging
 import librosa
-import tempfile
 from typing import Union, BinaryIO
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 from langchain_groq import ChatGroq
 import streamlit as st
 from dotenv import load_dotenv
-# Load environment variables
 load_dotenv()
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -20,15 +19,13 @@ logger = logging.getLogger(__name__)
 class UzbekSTT:
     """Enhanced Uzbek Speech-to-Text pipeline with grammar correction."""
     base_model_name = "oyqiz/uzbek_stt"
     def __init__(self):
         self.processor = None
-        self.model = Wav2Vec2ForCTC.from_pretrained(
-                    self.base_model_name,
-                    device_map="auto",
-                    trust_remote_code=True  # just in case the model repo has custom code
-                )
         self.groq_client = None
         self.load_models()
@@ -44,9 +41,7 @@ class UzbekSTT:
                 raise ValueError("GROQ_API_KEY environment variable is required")
             self.groq_client = ChatGroq(
-                model="llama3-70b-8192",  # Correct model name as per Groq
-                temperature=0.3,
-                api_key=groq_api_key,    # ✅ Make sure to pass API key here
             )
             logger.info("Models loaded successfully")
         except Exception as e:
@@ -59,67 +54,98 @@ class UzbekSTT:
             messages = [
                 (
                     "system",
-                    """Siz o‘zbek va rus tillarini aralash holda ishlatgan matnlarni tahrir qilish bo‘yicha tilshunos yordamchisiz. Sizga yuboriladigan matnda o‘zbekcha (lotin yozuvida) so‘zlar bilan bir qatorda ruscha so‘zlar ham bo‘lishi mumkin (ba’zida noto‘g‘ri yozilgan yoki lotin yozuvida). Sizning vazifangiz quyidagilardan iborat:
-                        1. O‘zbekcha jumlalarni grammatik jihatdan to‘g‘rilang, lekin ma’noni o‘zgartirmang.
-                        2. Ruscha so‘z yoki iboralarni aniqlang va ularni to‘g‘ri kirill yozuvida yozing.
-                        3. Butun jumlani o‘zbek va rus tillarini aralash ishlatadigan kishi uchun tabiiy va ravon holga keltiring.
-                        4. So‘zlarni tarjima qilmang – faqat grammatik tuzatish va yozuv (lotin → kirill) o‘zgarishini bajaring.
-                        5. Ism, joy nomlari, yoki madaniy terminlarga tegmang, ularni o‘z holida qoldiring.""",
                 ),
-                ("user", text),
             ]
             response = self.groq_client.invoke(messages)
-            return response.content.strip()
         except Exception as e:
             logger.error(f"Grammar correction failed: {str(e)}")
             return text
     def transcribe(self, audio_file: Union[str, BinaryIO]) -> str:
-        """Transcribe Uzbek speech to text with grammar correction."""
         try:
             logger.info("Processing audio file...")
             audio, _ = librosa.load(audio_file, sr=16000)
             input_values = self.processor(
                 audio, return_tensors="pt", padding="longest", sampling_rate=16000
             ).input_values
             with torch.no_grad():
                 logits = self.model(input_values).logits
                 predicted_ids = torch.argmax(logits, dim=-1)
-            transcription = self.processor.batch_decode(predicted_ids)[0].lower().strip()
             logger.info("Applying grammar correction...")
             corrected_text = self.correct_grammar(transcription)
             return corrected_text
         except Exception as e:
             logger.error(f"Transcription failed: {str(e)}")
             raise
 # ----------------- Streamlit App ----------------- #
 def main():
     st.set_page_config(
         page_title="Uzbek STT with Grammar Correction",
         page_icon="🗣️",
         layout="centered",
     )
     st.markdown(
         """
         <style>
         .stButton>button {
             background-color: #4CAF50;
             color: white;
             padding: 10px 24px;
             border: none;
             border-radius: 4px;
-            font-size: 16px;
             cursor: pointer;
         }
         .stButton>button:hover {
             background-color: #45a049;
@@ -134,39 +160,50 @@ def main():
         unsafe_allow_html=True,
     )
     st.markdown(
         "<h1 class='header'>🗣️ Uzbek Speech-to-Text & Grammar Correction</h1>",
         unsafe_allow_html=True,
     )
     st.markdown(
-        "Upload an Uzbek audio file. The model will transcribe it and fix grammar errors, converting Russian parts into Cyrillic where needed."
     )
     uploaded_file = st.file_uploader(
-        "Upload an audio file", type=["wav", "mp3", "m4a", "ogg"]
     )
     if uploaded_file is not None:
         st.audio(uploaded_file, format="audio/wav")
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-            tmp.write(uploaded_file.read())
-            temp_audio_path = tmp.name
         if st.button("Transcribe"):
-            with st.spinner("Transcribing and correcting..."):
                 try:
                     uzbek_stt = UzbekSTT()
                     transcription = uzbek_stt.transcribe(temp_audio_path)
-                    st.success("Done!")
-                    st.markdown("### Transcribed & Corrected Text:")
                     st.write(transcription)
                 except Exception as e:
                     st.error(f"An error occurred: {str(e)}")
                 finally:
-                    os.remove(temp_audio_path)
 if __name__ == "__main__":
-    main()

 import torch
 import logging
 import librosa
 from typing import Union, BinaryIO
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 from langchain_groq import ChatGroq
 import streamlit as st
 from dotenv import load_dotenv
 load_dotenv()
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class UzbekSTT:
     """Enhanced Uzbek Speech-to-Text pipeline with grammar correction."""
+    # Set a class-level base model name
     base_model_name = "oyqiz/uzbek_stt"
     def __init__(self):
+        """Initialize the Uzbek STT pipeline with grammar correction."""
         self.processor = None
+        self.model = None
         self.groq_client = None
         self.load_models()
                 raise ValueError("GROQ_API_KEY environment variable is required")
             self.groq_client = ChatGroq(
+                model="llama-3.3-70b-versatile", temperature=0.3
             )
             logger.info("Models loaded successfully")
         except Exception as e:
             messages = [
                 (
                     "system",
+                    "Siz o'zbek tilida mutaxassissiz. Sizning vazifangiz berilgan o'zbek matnining grammatikasini to'g'rilash. Hech qanday izoh, tarjima yoki qo'shimcha ma'lumot bermang. Faqat to'g'rilangan o'zbek matnini qaytaring.",
                 ),
+                ("human", text),
             ]
             response = self.groq_client.invoke(messages)
+            return (
+                response.content.strip()
+                if hasattr(response, "content")
+                else str(response).strip()
+            )
         except Exception as e:
             logger.error(f"Grammar correction failed: {str(e)}")
             return text
     def transcribe(self, audio_file: Union[str, BinaryIO]) -> str:
         """
         Transcribe Uzbek speech to text and run grammar correction on the result.

         Args:
             audio_file: Path to an audio file, or a file-like object; it is
                 passed to ``librosa.load``.

         Returns:
             str: The grammar-corrected transcription. If the decoded
             transcription is empty or whitespace-only, it is returned as-is
             and the grammar-correction step is skipped.

         Raises:
             FileNotFoundError: If ``audio_file`` is a string path that does
                 not exist.
             Exception: Any other failure is logged with its traceback and
                 re-raised unchanged.
         """
         try:
             # Check string paths up front so a missing file produces a clear
             # error instead of whatever the audio loader would raise.
             if isinstance(audio_file, str) and not os.path.exists(audio_file):
                 raise FileNotFoundError(f"Audio file not found: {audio_file}")
             logger.info("Processing audio file...")
             # Load at 16 kHz so the audio matches the sampling_rate declared
             # to the processor below.
             audio, _ = librosa.load(audio_file, sr=16000)
             input_values = self.processor(
                 audio, return_tensors="pt", padding="longest", sampling_rate=16000
             ).input_values
             # Inference only, so gradient tracking is disabled.
             with torch.no_grad():
                 logits = self.model(input_values).logits
                 # Take the highest-scoring token id at every position.
                 predicted_ids = torch.argmax(logits, dim=-1)
             transcription = self.processor.batch_decode(predicted_ids)[0]
             # Nothing was recognised (e.g. silence): do not send an empty
             # message to the grammar corrector.
             if not transcription.strip():
                 logger.info("Empty transcription; skipping grammar correction")
                 return transcription
             logger.info("Applying grammar correction...")
             return self.correct_grammar(transcription)
         except Exception as e:
             # logger.exception keeps the same message and adds the traceback.
             logger.exception("Transcription failed: %s", e)
             raise
+    @classmethod
+    def from_pretrained(cls, model_name: str = "mustafoyev202/uzbek_stt"):
+        """Factory method for 🤗 Transformers compatibility."""
+        if model_name != "mustafoyev202/uzbek_stt":
+            logger.warning(
+                f"Using base model {cls.base_model_name} regardless of specified model name"
+            )
+        return cls()
 # ----------------- Streamlit App ----------------- #
 def main():
+    # Set Streamlit page configuration
     st.set_page_config(
         page_title="Uzbek STT with Grammar Correction",
         page_icon="🗣️",
         layout="centered",
+        initial_sidebar_state="auto",
     )
+    # Inject custom CSS for a modern, beautiful design
     st.markdown(
         """
         <style>
+        body {
+            background-color: #f0f2f6;
+        }
+        .main {
+            font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
+        }
         .stButton>button {
             background-color: #4CAF50;
             color: white;
             padding: 10px 24px;
             border: none;
             border-radius: 4px;
             cursor: pointer;
+            font-size: 16px;
         }
         .stButton>button:hover {
             background-color: #45a049;
         unsafe_allow_html=True,
     )
+    # App header
     st.markdown(
         "<h1 class='header'>🗣️ Uzbek Speech-to-Text & Grammar Correction</h1>",
         unsafe_allow_html=True,
     )
     st.markdown(
+        """
+        Welcome to the **Uzbek STT** application, where cutting-edge technology meets
+        linguistic precision. Upload an Uzbek audio file, and let our model transcribe and
+        correct your text in real time!
+        """
     )
+    # File uploader for audio files
     uploaded_file = st.file_uploader(
+        "Upload your Uzbek audio file", type=["wav", "mp3", "m4a", "ogg"]
     )
     if uploaded_file is not None:
+        # Display an audio player for the uploaded file
         st.audio(uploaded_file, format="audio/wav")
+        # Save the uploaded file to a temporary file
+        temp_audio_path = "temp_audio.wav"
+        with open(temp_audio_path, "wb") as f:
+            f.write(uploaded_file.read())
         if st.button("Transcribe"):
+            with st.spinner("Processing your audio file..."):
                 try:
+                    # Initialize the UzbekSTT pipeline
                     uzbek_stt = UzbekSTT()
+                    # Transcribe and correct the audio
                     transcription = uzbek_stt.transcribe(temp_audio_path)
+                    st.success("Transcription complete!")
+                    st.markdown("### Transcribed Text:")
                     st.write(transcription)
                 except Exception as e:
                     st.error(f"An error occurred: {str(e)}")
                 finally:
+                    # Clean up the temporary audio file
+                    if os.path.exists(temp_audio_path):
+                        os.remove(temp_audio_path)
 if __name__ == "__main__":
+    main()