mustafoyev202 committed on
Commit
e210e52
·
verified ·
1 Parent(s): 328d5e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -34
app.py CHANGED
@@ -2,16 +2,15 @@ import os
2
  import torch
3
  import logging
4
  import librosa
5
- import tempfile
6
  from typing import Union, BinaryIO
7
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
8
  from langchain_groq import ChatGroq
9
  import streamlit as st
10
  from dotenv import load_dotenv
11
 
12
- # Load environment variables
13
  load_dotenv()
14
 
 
15
  # Configure logging
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
@@ -20,15 +19,13 @@ logger = logging.getLogger(__name__)
20
  class UzbekSTT:
21
  """Enhanced Uzbek Speech-to-Text pipeline with grammar correction."""
22
 
 
23
  base_model_name = "oyqiz/uzbek_stt"
24
 
25
  def __init__(self):
 
26
  self.processor = None
27
- self.model = Wav2Vec2ForCTC.from_pretrained(
28
- self.base_model_name,
29
- device_map="auto",
30
- trust_remote_code=True # just in case the model repo has custom code
31
- )
32
  self.groq_client = None
33
  self.load_models()
34
 
@@ -44,9 +41,7 @@ class UzbekSTT:
44
  raise ValueError("GROQ_API_KEY environment variable is required")
45
 
46
  self.groq_client = ChatGroq(
47
- model="llama3-70b-8192", # Correct model name as per Groq
48
- temperature=0.3,
49
- api_key=groq_api_key, # ✅ Make sure to pass API key here
50
  )
51
  logger.info("Models loaded successfully")
52
  except Exception as e:
@@ -59,67 +54,98 @@ class UzbekSTT:
59
  messages = [
60
  (
61
  "system",
62
- """Siz ozbek va rus tillarini aralash holda ishlatgan matnlarni tahrir qilish bo‘yicha tilshunos yordamchisiz. Sizga yuboriladigan matnda o‘zbekcha (lotin yozuvida) so‘zlar bilan bir qatorda ruscha so‘zlar ham bo‘lishi mumkin (ba’zida noto‘g‘ri yozilgan yoki lotin yozuvida). Sizning vazifangiz quyidagilardan iborat:
63
-
64
- 1. O‘zbekcha jumlalarni grammatik jihatdan to‘g‘rilang, lekin ma’noni o‘zgartirmang.
65
- 2. Ruscha so‘z yoki iboralarni aniqlang va ularni to‘g‘ri kirill yozuvida yozing.
66
- 3. Butun jumlani o‘zbek va rus tillarini aralash ishlatadigan kishi uchun tabiiy va ravon holga keltiring.
67
- 4. So‘zlarni tarjima qilmang – faqat grammatik tuzatish va yozuv (lotin → kirill) o‘zgarishini bajaring.
68
- 5. Ism, joy nomlari, yoki madaniy terminlarga tegmang, ularni o‘z holida qoldiring.""",
69
  ),
70
- ("user", text),
71
  ]
72
  response = self.groq_client.invoke(messages)
73
- return response.content.strip()
 
 
 
 
74
  except Exception as e:
75
  logger.error(f"Grammar correction failed: {str(e)}")
76
  return text
77
 
78
  def transcribe(self, audio_file: Union[str, BinaryIO]) -> str:
79
- """Transcribe Uzbek speech to text with grammar correction."""
 
 
 
 
 
 
 
 
80
  try:
 
 
 
 
81
  logger.info("Processing audio file...")
82
  audio, _ = librosa.load(audio_file, sr=16000)
83
-
84
  input_values = self.processor(
85
  audio, return_tensors="pt", padding="longest", sampling_rate=16000
86
  ).input_values
87
 
 
88
  with torch.no_grad():
89
  logits = self.model(input_values).logits
90
  predicted_ids = torch.argmax(logits, dim=-1)
91
 
92
- transcription = self.processor.batch_decode(predicted_ids)[0].lower().strip()
93
 
 
94
  logger.info("Applying grammar correction...")
95
  corrected_text = self.correct_grammar(transcription)
 
96
  return corrected_text
97
 
98
  except Exception as e:
99
  logger.error(f"Transcription failed: {str(e)}")
100
  raise
101
 
 
 
 
 
 
 
 
 
 
102
 
103
  # ----------------- Streamlit App ----------------- #
104
 
 
105
  def main():
 
106
  st.set_page_config(
107
  page_title="Uzbek STT with Grammar Correction",
108
  page_icon="🗣️",
109
  layout="centered",
 
110
  )
111
 
 
112
  st.markdown(
113
  """
114
  <style>
 
 
 
 
 
 
115
  .stButton>button {
116
  background-color: #4CAF50;
117
  color: white;
118
  padding: 10px 24px;
119
  border: none;
120
  border-radius: 4px;
121
- font-size: 16px;
122
  cursor: pointer;
 
123
  }
124
  .stButton>button:hover {
125
  background-color: #45a049;
@@ -134,39 +160,50 @@ def main():
134
  unsafe_allow_html=True,
135
  )
136
 
 
137
  st.markdown(
138
  "<h1 class='header'>🗣️ Uzbek Speech-to-Text & Grammar Correction</h1>",
139
  unsafe_allow_html=True,
140
  )
141
-
142
  st.markdown(
143
- "Upload an Uzbek audio file. The model will transcribe it and fix grammar errors, converting Russian parts into Cyrillic where needed."
 
 
 
 
144
  )
145
 
 
146
  uploaded_file = st.file_uploader(
147
- "Upload an audio file", type=["wav", "mp3", "m4a", "ogg"]
148
  )
149
 
150
  if uploaded_file is not None:
 
151
  st.audio(uploaded_file, format="audio/wav")
152
 
153
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
154
- tmp.write(uploaded_file.read())
155
- temp_audio_path = tmp.name
 
156
 
157
  if st.button("Transcribe"):
158
- with st.spinner("Transcribing and correcting..."):
159
  try:
 
160
  uzbek_stt = UzbekSTT()
 
161
  transcription = uzbek_stt.transcribe(temp_audio_path)
162
- st.success("Done!")
163
- st.markdown("### Transcribed & Corrected Text:")
164
  st.write(transcription)
165
  except Exception as e:
166
  st.error(f"An error occurred: {str(e)}")
167
  finally:
168
- os.remove(temp_audio_path)
 
 
169
 
170
 
171
  if __name__ == "__main__":
172
- main()
 
2
  import torch
3
  import logging
4
  import librosa
 
5
  from typing import Union, BinaryIO
6
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
7
  from langchain_groq import ChatGroq
8
  import streamlit as st
9
  from dotenv import load_dotenv
10
 
 
11
  load_dotenv()
12
 
13
+
14
  # Configure logging
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
 
19
  class UzbekSTT:
20
  """Enhanced Uzbek Speech-to-Text pipeline with grammar correction."""
21
 
22
+ # Set a class-level base model name
23
  base_model_name = "oyqiz/uzbek_stt"
24
 
25
  def __init__(self):
26
+ """Initialize the Uzbek STT pipeline with grammar correction."""
27
  self.processor = None
28
+ self.model = None
 
 
 
 
29
  self.groq_client = None
30
  self.load_models()
31
 
 
41
  raise ValueError("GROQ_API_KEY environment variable is required")
42
 
43
  self.groq_client = ChatGroq(
44
+ model="llama-3.3-70b-versatile", temperature=0.3
 
 
45
  )
46
  logger.info("Models loaded successfully")
47
  except Exception as e:
 
54
  messages = [
55
  (
56
  "system",
57
+ "Siz o'zbek tilida mutaxassissiz. Sizning vazifangiz berilgan o'zbek matnining grammatikasini to'g'rilash. Hech qanday izoh, tarjima yoki qo'shimcha ma'lumot bermang. Faqat to'g'rilangan o'zbek matnini qaytaring.",
 
 
 
 
 
 
58
  ),
59
+ ("human", text),
60
  ]
61
  response = self.groq_client.invoke(messages)
62
+ return (
63
+ response.content.strip()
64
+ if hasattr(response, "content")
65
+ else str(response).strip()
66
+ )
67
  except Exception as e:
68
  logger.error(f"Grammar correction failed: {str(e)}")
69
  return text
70
 
71
  def transcribe(self, audio_file: Union[str, BinaryIO]) -> str:
72
+ """
73
+ Transcribe Uzbek speech to text with grammar correction.
74
+
75
+ Args:
76
+ audio_file: Path to audio file or file-like object
77
+
78
+ Returns:
79
+ str: Transcribed and grammar-corrected text
80
+ """
81
  try:
82
+ # Validate and load audio
83
+ if isinstance(audio_file, str) and not os.path.exists(audio_file):
84
+ raise FileNotFoundError(f"Audio file not found: {audio_file}")
85
+
86
  logger.info("Processing audio file...")
87
  audio, _ = librosa.load(audio_file, sr=16000)
 
88
  input_values = self.processor(
89
  audio, return_tensors="pt", padding="longest", sampling_rate=16000
90
  ).input_values
91
 
92
+ # Generate transcription
93
  with torch.no_grad():
94
  logits = self.model(input_values).logits
95
  predicted_ids = torch.argmax(logits, dim=-1)
96
 
97
+ transcription = self.processor.batch_decode(predicted_ids)[0]
98
 
99
+ # Apply grammar correction
100
  logger.info("Applying grammar correction...")
101
  corrected_text = self.correct_grammar(transcription)
102
+
103
  return corrected_text
104
 
105
  except Exception as e:
106
  logger.error(f"Transcription failed: {str(e)}")
107
  raise
108
 
109
@classmethod
def from_pretrained(cls, model_name: str = "mustafoyev202/uzbek_stt"):
    """Alternate constructor mirroring the Transformers ``from_pretrained`` call.

    The ``model_name`` argument does not select a model: the pipeline is
    always built with ``cls()``. A warning is logged only when the caller
    asks for a name that is neither the default alias nor
    ``cls.base_model_name``, because only then is the request being ignored.

    Args:
        model_name: Requested model identifier (informational only).

    Returns:
        A new instance created with ``cls()``.
    """
    # Asking for the base model itself is not a mismatch, so do not warn.
    accepted_names = ("mustafoyev202/uzbek_stt", cls.base_model_name)
    if model_name not in accepted_names:
        logger.warning(
            "Using base model %s regardless of specified model name",
            cls.base_model_name,
        )
    return cls()
117
+
118
 
119
  # ----------------- Streamlit App ----------------- #
120
 
121
+
122
  def main():
123
+ # Set Streamlit page configuration
124
  st.set_page_config(
125
  page_title="Uzbek STT with Grammar Correction",
126
  page_icon="🗣️",
127
  layout="centered",
128
+ initial_sidebar_state="auto",
129
  )
130
 
131
+ # Inject custom CSS for a modern, beautiful design
132
  st.markdown(
133
  """
134
  <style>
135
+ body {
136
+ background-color: #f0f2f6;
137
+ }
138
+ .main {
139
+ font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
140
+ }
141
  .stButton>button {
142
  background-color: #4CAF50;
143
  color: white;
144
  padding: 10px 24px;
145
  border: none;
146
  border-radius: 4px;
 
147
  cursor: pointer;
148
+ font-size: 16px;
149
  }
150
  .stButton>button:hover {
151
  background-color: #45a049;
 
160
  unsafe_allow_html=True,
161
  )
162
 
163
+ # App header
164
  st.markdown(
165
  "<h1 class='header'>🗣️ Uzbek Speech-to-Text & Grammar Correction</h1>",
166
  unsafe_allow_html=True,
167
  )
 
168
  st.markdown(
169
+ """
170
+ Welcome to the **Uzbek STT** application, where cutting-edge technology meets
171
+ linguistic precision. Upload an Uzbek audio file, and let our model transcribe and
172
+ correct your text in real time!
173
+ """
174
  )
175
 
176
+ # File uploader for audio files
177
  uploaded_file = st.file_uploader(
178
+ "Upload your Uzbek audio file", type=["wav", "mp3", "m4a", "ogg"]
179
  )
180
 
181
  if uploaded_file is not None:
182
+ # Display an audio player for the uploaded file
183
  st.audio(uploaded_file, format="audio/wav")
184
 
185
+ # Save the uploaded file to a temporary file
186
+ temp_audio_path = "temp_audio.wav"
187
+ with open(temp_audio_path, "wb") as f:
188
+ f.write(uploaded_file.read())
189
 
190
  if st.button("Transcribe"):
191
+ with st.spinner("Processing your audio file..."):
192
  try:
193
+ # Initialize the UzbekSTT pipeline
194
  uzbek_stt = UzbekSTT()
195
+ # Transcribe and correct the audio
196
  transcription = uzbek_stt.transcribe(temp_audio_path)
197
+ st.success("Transcription complete!")
198
+ st.markdown("### Transcribed Text:")
199
  st.write(transcription)
200
  except Exception as e:
201
  st.error(f"An error occurred: {str(e)}")
202
  finally:
203
+ # Clean up the temporary audio file
204
+ if os.path.exists(temp_audio_path):
205
+ os.remove(temp_audio_path)
206
 
207
 
208
  if __name__ == "__main__":
209
+ main()