Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,16 +2,15 @@ import os
|
|
2 |
import torch
|
3 |
import logging
|
4 |
import librosa
|
5 |
-
import tempfile
|
6 |
from typing import Union, BinaryIO
|
7 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
8 |
from langchain_groq import ChatGroq
|
9 |
import streamlit as st
|
10 |
from dotenv import load_dotenv
|
11 |
|
12 |
-
# Load environment variables
|
13 |
load_dotenv()
|
14 |
|
|
|
15 |
# Configure logging
|
16 |
logging.basicConfig(level=logging.INFO)
|
17 |
logger = logging.getLogger(__name__)
|
@@ -20,15 +19,13 @@ logger = logging.getLogger(__name__)
|
|
20 |
class UzbekSTT:
|
21 |
"""Enhanced Uzbek Speech-to-Text pipeline with grammar correction."""
|
22 |
|
|
|
23 |
base_model_name = "oyqiz/uzbek_stt"
|
24 |
|
25 |
def __init__(self):
|
|
|
26 |
self.processor = None
|
27 |
-
self.model =
|
28 |
-
self.base_model_name,
|
29 |
-
device_map="auto",
|
30 |
-
trust_remote_code=True # just in case the model repo has custom code
|
31 |
-
)
|
32 |
self.groq_client = None
|
33 |
self.load_models()
|
34 |
|
@@ -44,9 +41,7 @@ class UzbekSTT:
|
|
44 |
raise ValueError("GROQ_API_KEY environment variable is required")
|
45 |
|
46 |
self.groq_client = ChatGroq(
|
47 |
-
model="
|
48 |
-
temperature=0.3,
|
49 |
-
api_key=groq_api_key, # ✅ Make sure to pass API key here
|
50 |
)
|
51 |
logger.info("Models loaded successfully")
|
52 |
except Exception as e:
|
@@ -59,67 +54,98 @@ class UzbekSTT:
|
|
59 |
messages = [
|
60 |
(
|
61 |
"system",
|
62 |
-
"
|
63 |
-
|
64 |
-
1. O‘zbekcha jumlalarni grammatik jihatdan to‘g‘rilang, lekin ma’noni o‘zgartirmang.
|
65 |
-
2. Ruscha so‘z yoki iboralarni aniqlang va ularni to‘g‘ri kirill yozuvida yozing.
|
66 |
-
3. Butun jumlani o‘zbek va rus tillarini aralash ishlatadigan kishi uchun tabiiy va ravon holga keltiring.
|
67 |
-
4. So‘zlarni tarjima qilmang – faqat grammatik tuzatish va yozuv (lotin → kirill) o‘zgarishini bajaring.
|
68 |
-
5. Ism, joy nomlari, yoki madaniy terminlarga tegmang, ularni o‘z holida qoldiring.""",
|
69 |
),
|
70 |
-
("
|
71 |
]
|
72 |
response = self.groq_client.invoke(messages)
|
73 |
-
return
|
|
|
|
|
|
|
|
|
74 |
except Exception as e:
|
75 |
logger.error(f"Grammar correction failed: {str(e)}")
|
76 |
return text
|
77 |
|
78 |
def transcribe(self, audio_file: Union[str, BinaryIO]) -> str:
|
79 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
try:
|
|
|
|
|
|
|
|
|
81 |
logger.info("Processing audio file...")
|
82 |
audio, _ = librosa.load(audio_file, sr=16000)
|
83 |
-
|
84 |
input_values = self.processor(
|
85 |
audio, return_tensors="pt", padding="longest", sampling_rate=16000
|
86 |
).input_values
|
87 |
|
|
|
88 |
with torch.no_grad():
|
89 |
logits = self.model(input_values).logits
|
90 |
predicted_ids = torch.argmax(logits, dim=-1)
|
91 |
|
92 |
-
transcription = self.processor.batch_decode(predicted_ids)[0]
|
93 |
|
|
|
94 |
logger.info("Applying grammar correction...")
|
95 |
corrected_text = self.correct_grammar(transcription)
|
|
|
96 |
return corrected_text
|
97 |
|
98 |
except Exception as e:
|
99 |
logger.error(f"Transcription failed: {str(e)}")
|
100 |
raise
|
101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
# ----------------- Streamlit App ----------------- #
|
104 |
|
|
|
105 |
def main():
|
|
|
106 |
st.set_page_config(
|
107 |
page_title="Uzbek STT with Grammar Correction",
|
108 |
page_icon="🗣️",
|
109 |
layout="centered",
|
|
|
110 |
)
|
111 |
|
|
|
112 |
st.markdown(
|
113 |
"""
|
114 |
<style>
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
.stButton>button {
|
116 |
background-color: #4CAF50;
|
117 |
color: white;
|
118 |
padding: 10px 24px;
|
119 |
border: none;
|
120 |
border-radius: 4px;
|
121 |
-
font-size: 16px;
|
122 |
cursor: pointer;
|
|
|
123 |
}
|
124 |
.stButton>button:hover {
|
125 |
background-color: #45a049;
|
@@ -134,39 +160,50 @@ def main():
|
|
134 |
unsafe_allow_html=True,
|
135 |
)
|
136 |
|
|
|
137 |
st.markdown(
|
138 |
"<h1 class='header'>🗣️ Uzbek Speech-to-Text & Grammar Correction</h1>",
|
139 |
unsafe_allow_html=True,
|
140 |
)
|
141 |
-
|
142 |
st.markdown(
|
143 |
-
"
|
|
|
|
|
|
|
|
|
144 |
)
|
145 |
|
|
|
146 |
uploaded_file = st.file_uploader(
|
147 |
-
"Upload
|
148 |
)
|
149 |
|
150 |
if uploaded_file is not None:
|
|
|
151 |
st.audio(uploaded_file, format="audio/wav")
|
152 |
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
156 |
|
157 |
if st.button("Transcribe"):
|
158 |
-
with st.spinner("
|
159 |
try:
|
|
|
160 |
uzbek_stt = UzbekSTT()
|
|
|
161 |
transcription = uzbek_stt.transcribe(temp_audio_path)
|
162 |
-
st.success("
|
163 |
-
st.markdown("### Transcribed
|
164 |
st.write(transcription)
|
165 |
except Exception as e:
|
166 |
st.error(f"An error occurred: {str(e)}")
|
167 |
finally:
|
168 |
-
|
|
|
|
|
169 |
|
170 |
|
171 |
if __name__ == "__main__":
|
172 |
-
main()
|
|
|
2 |
import torch
|
3 |
import logging
|
4 |
import librosa
|
|
|
5 |
from typing import Union, BinaryIO
|
6 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
7 |
from langchain_groq import ChatGroq
|
8 |
import streamlit as st
|
9 |
from dotenv import load_dotenv
|
10 |
|
|
|
11 |
# Load variables from a local .env file into the process environment before
# anything reads them.
load_dotenv()

# Module-wide logging: emit INFO and above, through a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
19 |
class UzbekSTT:
|
20 |
"""Enhanced Uzbek Speech-to-Text pipeline with grammar correction."""
|
21 |
|
22 |
+
# Set a class-level base model name
|
23 |
base_model_name = "oyqiz/uzbek_stt"
|
24 |
|
25 |
def __init__(self):
    """Set up an empty pipeline and immediately load its components.

    The processor, acoustic model and Groq client all start as ``None``;
    ``load_models()`` is then called right away from the constructor.
    """
    # Chained assignment binds left to right, matching the original order.
    self.processor = self.model = self.groq_client = None
    self.load_models()
|
31 |
|
|
|
41 |
raise ValueError("GROQ_API_KEY environment variable is required")
|
42 |
|
43 |
self.groq_client = ChatGroq(
|
44 |
+
model="llama-3.3-70b-versatile", temperature=0.3
|
|
|
|
|
45 |
)
|
46 |
logger.info("Models loaded successfully")
|
47 |
except Exception as e:
|
|
|
54 |
messages = [
|
55 |
(
|
56 |
"system",
|
57 |
+
"Siz o'zbek tilida mutaxassissiz. Sizning vazifangiz berilgan o'zbek matnining grammatikasini to'g'rilash. Hech qanday izoh, tarjima yoki qo'shimcha ma'lumot bermang. Faqat to'g'rilangan o'zbek matnini qaytaring.",
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
),
|
59 |
+
("human", text),
|
60 |
]
|
61 |
response = self.groq_client.invoke(messages)
|
62 |
+
return (
|
63 |
+
response.content.strip()
|
64 |
+
if hasattr(response, "content")
|
65 |
+
else str(response).strip()
|
66 |
+
)
|
67 |
except Exception as e:
|
68 |
logger.error(f"Grammar correction failed: {str(e)}")
|
69 |
return text
|
70 |
|
71 |
def transcribe(self, audio_file: Union[str, BinaryIO]) -> str:
    """Transcribe Uzbek speech to text and grammar-correct the result.

    The audio is loaded with ``librosa`` at a 16 kHz sample rate, passed
    through ``self.processor`` and ``self.model``, decoded greedily (argmax
    over the logits), and the raw transcript is handed to
    ``self.correct_grammar``.

    Args:
        audio_file: Path to an audio file (``str`` or ``os.PathLike``) or a
            file-like object accepted by ``librosa.load``.

    Returns:
        str: Transcribed and grammar-corrected text.

    Raises:
        FileNotFoundError: If ``audio_file`` is a path that does not exist.
        ValueError: If the loaded audio contains no samples.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        # Validate paths up front (including PathLike objects) so a missing
        # file gives a clear error; file-like objects skip this check.
        if isinstance(audio_file, (str, os.PathLike)) and not os.path.exists(
            audio_file
        ):
            raise FileNotFoundError(f"Audio file not found: {audio_file}")

        logger.info("Processing audio file...")
        audio, _ = librosa.load(audio_file, sr=16000)

        # An empty waveform would otherwise fail deep inside the model with
        # an opaque shape error.
        if len(audio) == 0:
            raise ValueError("Audio file contains no samples")

        input_values = self.processor(
            audio, return_tensors="pt", padding="longest", sampling_rate=16000
        ).input_values

        # Keep the inputs on the same device as the model; this is a no-op
        # when the model lives on the CPU.
        device = getattr(self.model, "device", None)
        if device is not None:
            input_values = input_values.to(device)

        # Generate transcription without tracking gradients.
        with torch.no_grad():
            logits = self.model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)

        transcription = self.processor.batch_decode(predicted_ids)[0]

        # Apply grammar correction
        logger.info("Applying grammar correction...")
        return self.correct_grammar(transcription)

    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        raise
|
108 |
|
109 |
+
@classmethod
def from_pretrained(cls, model_name: str = "mustafoyev202/uzbek_stt"):
    """Alternate constructor named after the 🤗 Transformers convention.

    ``model_name`` is accepted for API compatibility only: the instance is
    always built with ``cls()``, which takes no model argument. A warning is
    logged when the requested name is neither the default alias nor
    ``cls.base_model_name``.

    Args:
        model_name: Requested model identifier.

    Returns:
        A new instance of ``cls``.
    """
    default_alias = "mustafoyev202/uzbek_stt"
    # Requesting the base model by its own name is not a mismatch, so only
    # warn for names that are neither the alias nor the base model.
    if model_name not in (default_alias, cls.base_model_name):
        logger.warning(
            f"Using base model {cls.base_model_name} regardless of specified model name"
        )
    return cls()
|
117 |
+
|
118 |
|
119 |
# ----------------- Streamlit App ----------------- #
|
120 |
|
121 |
+
|
122 |
def main():
|
123 |
+
# Set Streamlit page configuration
|
124 |
st.set_page_config(
|
125 |
page_title="Uzbek STT with Grammar Correction",
|
126 |
page_icon="🗣️",
|
127 |
layout="centered",
|
128 |
+
initial_sidebar_state="auto",
|
129 |
)
|
130 |
|
131 |
+
# Inject custom CSS for a modern, beautiful design
|
132 |
st.markdown(
|
133 |
"""
|
134 |
<style>
|
135 |
+
body {
|
136 |
+
background-color: #f0f2f6;
|
137 |
+
}
|
138 |
+
.main {
|
139 |
+
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
|
140 |
+
}
|
141 |
.stButton>button {
|
142 |
background-color: #4CAF50;
|
143 |
color: white;
|
144 |
padding: 10px 24px;
|
145 |
border: none;
|
146 |
border-radius: 4px;
|
|
|
147 |
cursor: pointer;
|
148 |
+
font-size: 16px;
|
149 |
}
|
150 |
.stButton>button:hover {
|
151 |
background-color: #45a049;
|
|
|
160 |
unsafe_allow_html=True,
|
161 |
)
|
162 |
|
163 |
+
# App header
|
164 |
st.markdown(
|
165 |
"<h1 class='header'>🗣️ Uzbek Speech-to-Text & Grammar Correction</h1>",
|
166 |
unsafe_allow_html=True,
|
167 |
)
|
|
|
168 |
st.markdown(
|
169 |
+
"""
|
170 |
+
Welcome to the **Uzbek STT** application, where cutting-edge technology meets
|
171 |
+
linguistic precision. Upload an Uzbek audio file, and let our model transcribe and
|
172 |
+
correct your text in real time!
|
173 |
+
"""
|
174 |
)
|
175 |
|
176 |
+
# File uploader for audio files
|
177 |
uploaded_file = st.file_uploader(
|
178 |
+
"Upload your Uzbek audio file", type=["wav", "mp3", "m4a", "ogg"]
|
179 |
)
|
180 |
|
181 |
if uploaded_file is not None:
|
182 |
+
# Display an audio player for the uploaded file
|
183 |
st.audio(uploaded_file, format="audio/wav")
|
184 |
|
185 |
+
# Save the uploaded file to a temporary file
|
186 |
+
temp_audio_path = "temp_audio.wav"
|
187 |
+
with open(temp_audio_path, "wb") as f:
|
188 |
+
f.write(uploaded_file.read())
|
189 |
|
190 |
if st.button("Transcribe"):
|
191 |
+
with st.spinner("Processing your audio file..."):
|
192 |
try:
|
193 |
+
# Initialize the UzbekSTT pipeline
|
194 |
uzbek_stt = UzbekSTT()
|
195 |
+
# Transcribe and correct the audio
|
196 |
transcription = uzbek_stt.transcribe(temp_audio_path)
|
197 |
+
st.success("Transcription complete!")
|
198 |
+
st.markdown("### Transcribed Text:")
|
199 |
st.write(transcription)
|
200 |
except Exception as e:
|
201 |
st.error(f"An error occurred: {str(e)}")
|
202 |
finally:
|
203 |
+
# Clean up the temporary audio file
|
204 |
+
if os.path.exists(temp_audio_path):
|
205 |
+
os.remove(temp_audio_path)
|
206 |
|
207 |
|
208 |
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|