Update app.py
app.py
CHANGED
@@ -1,38 +1,952 @@
import gradio as gr
-import
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-    transcription = model.transcribe([audio_path], batch_size=1, language_id='ks')[0]
-
-    return transcription
-
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),  # Allows both record and upload
-    outputs=gr.Textbox(label="Transcription"),
-    live=True,
-    title="IndicConformer ASR Model for Kashmiri",
-    description="This model transcribes Kashmiri speech to text using the IndicConformer ASR model. You can either record or upload an audio file."
)

-#
-
"""
TTS Dataset Collection Tool with Custom Fonts and Enhanced Features
"""
import os
import json
import nltk
import gradio as gr
import uuid
from datetime import datetime
from pathlib import Path
import logging
from typing import Dict, Tuple, Optional
import traceback
import soundfile as sf
import re

# Configure logging first so the `logger` used in the handlers below is defined
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Download required NLTK data during initialization
try:
    nltk.download('punkt')  # Download punkt tokenizer data
    nltk.data.find('tokenizers/punkt')
except Exception as e:
    logger.warning(f"Error downloading NLTK data: {str(e)}")
    logger.warning("NLTK tokenization might not work properly")

# Font configurations
FONT_STYLES = {
    "english_serif": {
        "name": "Times New Roman",
        "family": "Times New Roman",
        "css": "font-family: 'Times New Roman', serif;"
    },
    "english_sans": {
        "name": "Arial",
        "family": "Arial",
        "css": "font-family: Arial, sans-serif;"
    },
    "nastaliq": {
        "name": "Nastaliq",
        "family": "Noto Nastaliq Urdu",
        "css": "font-family: 'Noto Nastaliq Urdu', serif;"
    },
    "naskh": {
        "name": "Naskh",
        "family": "Scheherazade New",
        "css": "font-family: 'Scheherazade New', serif;"
    }
}


class TTSDatasetCollector:
    """Manages TTS dataset collection and organization with enhanced features"""

    def __init__(self):
        """Initialize the TTS Dataset Collector"""
        # Handle both script and notebook environments for root path
        try:
            # When running as a script
            self.root_path = Path(os.path.dirname(os.path.abspath(__file__))) / "dataset"
        except NameError:
            # When running in Jupyter/IPython
            self.root_path = Path.cwd() / "dataset"

        self.fonts_path = self.root_path / "fonts"
        self.sentences = []
        self.current_index = 0
        self.current_font = "english_serif"
        self.custom_fonts = {}
        self.recordings = {}  # Store recordings by sentence index
        self.setup_directories()

        # Ensure NLTK data is downloaded
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', quiet=True)

        logger.info("TTS Dataset Collector initialized")

    def setup_directories(self) -> None:
        """Create necessary directory structure with logging"""
        try:
            # Create main dataset directory
            self.root_path.mkdir(parents=True, exist_ok=True)

            # Create subdirectories
            for subdir in ['audio', 'transcriptions', 'metadata', 'fonts']:
                (self.root_path / subdir).mkdir(parents=True, exist_ok=True)

            # Initialize log file
            log_file = self.root_path / 'dataset_log.txt'
            if not log_file.exists():
                with open(log_file, 'w', encoding='utf-8') as f:
                    f.write(f"Dataset collection initialized on {datetime.now().isoformat()}\n")

            logger.info("Directory structure created successfully")

        except Exception as e:
            logger.error(f"Failed to create directory structure: {str(e)}")
            logger.error(traceback.format_exc())
            raise RuntimeError("Failed to initialize directory structure")

    def log_operation(self, message: str, level: str = "info") -> None:
        """Log operations with timestamp and level"""
        try:
            log_file = self.root_path / 'dataset_log.txt'
            timestamp = datetime.now().isoformat()

            with open(log_file, 'a', encoding='utf-8') as f:
                f.write(f"[{timestamp}] [{level.upper()}] {message}\n")

            if level.lower() == "error":
                logger.error(message)
            else:
                logger.info(message)

        except Exception as e:
            logger.error(f"Failed to log operation: {str(e)}")

    def process_text(self, text: str) -> Tuple[bool, str]:
        """Process pasted or loaded text with error handling"""
        try:
            if not text.strip():
                return False, "Text is empty"

            # Simple sentence splitting as fallback
            def simple_split_sentences(text):
                # Split on common sentence endings
                sentences = []
                current = []

                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    # Split on common sentence endings
                    parts = re.split(r'[.!?]', line)
                    for part in parts:
                        part = part.strip()
                        if part:
                            current.append(part)
                            sentences.append(' '.join(current))
                            current = []

                if current:
                    sentences.append(' '.join(current))

                return [s.strip() for s in sentences if s.strip()]

            try:
                # Try NLTK first
                self.sentences = nltk.sent_tokenize(text.strip())
            except Exception as e:
                logger.warning(f"NLTK tokenization failed, falling back to simple splitting: {str(e)}")
                # Fallback to simple splitting
                self.sentences = simple_split_sentences(text.strip())

            if not self.sentences:
                return False, "No valid sentences found in text"

            self.current_index = 0

            # Log success
            self.log_operation(f"Processed text with {len(self.sentences)} sentences")
            return True, f"Successfully loaded {len(self.sentences)} sentences"

        except Exception as e:
            error_msg = f"Error processing text: {str(e)}"
            self.log_operation(error_msg, "error")
            logger.error(traceback.format_exc())
            return False, error_msg

    def load_text_file(self, file) -> Tuple[bool, str]:
        """Process and load text file with enhanced error handling"""
        if not file:
            return False, "No file provided"

        try:
            # Validate file extension
            if not file.name.endswith('.txt'):
                return False, "Only .txt files are supported"

            text = file.read().decode('utf-8')

            return self.process_text(text)

        except UnicodeDecodeError:
            error_msg = "File encoding error. Please ensure the file is UTF-8 encoded"
            self.log_operation(error_msg, "error")
            return False, error_msg
        except Exception as e:
            error_msg = f"Error loading file: {str(e)}"
            self.log_operation(error_msg, "error")
            logger.error(traceback.format_exc())
            return False, error_msg

    def get_styled_text(self, text: str) -> str:
        """Get text with current font styling"""
        font_css = FONT_STYLES.get(self.current_font, {}).get('css', '')
        return f"<div style='{font_css}'>{text}</div>"

    def set_font(self, font_style: str) -> Tuple[bool, str]:
        """Set the current font style"""
        if font_style not in FONT_STYLES and font_style not in self.custom_fonts:
            available_fonts = ', '.join(list(FONT_STYLES.keys()) + list(self.custom_fonts.keys()))
            return False, f"Invalid font style. Available styles: {available_fonts}"
        self.current_font = font_style
        return True, f"Font style set to {font_style}"

    def add_custom_font(self, font_file_path) -> Tuple[bool, str]:
        """Add a custom font from the uploaded TTF file"""
        try:
            if not font_file_path:
                return False, "No font file provided"

            if not font_file_path.endswith('.ttf'):
                return False, "Only .ttf font files are supported"

            # Generate a unique font family name
            font_family = f"font_{uuid.uuid4().hex[:8]}"
            font_filename = font_family + '.ttf'
            font_dest = self.fonts_path / font_filename

            # Read and save the font file
            with open(font_file_path, 'rb') as f_src, open(font_dest, 'wb') as f_dest:
                f_dest.write(f_src.read())

            # Add to custom fonts
            self.custom_fonts[font_family] = {
                'name': os.path.basename(font_file_path),
                'family': font_family,
                'css': f"font-family: '{font_family}', serif;"
            }

            # Update the FONT_STYLES with the custom font
            FONT_STYLES[font_family] = self.custom_fonts[font_family]

            # Log success
            self.log_operation(f"Added custom font: {font_file_path} as {font_family}")
            return True, f"Custom font '{os.path.basename(font_file_path)}' added successfully"

        except Exception as e:
            error_msg = f"Error adding custom font: {str(e)}"
            self.log_operation(error_msg, "error")
            logger.error(traceback.format_exc())
            return False, error_msg

    def generate_filenames(self, dataset_name: str, speaker_id: str, sentence_text: str) -> Tuple[str, str]:
        """Generate unique filenames for audio and text files"""
        line_number = self.current_index + 1
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

        # Sanitize strings for filenames
        def sanitize_filename(s):
            return re.sub(r'[^a-zA-Z0-9_-]', '_', s)[:50]

        dataset_name_safe = sanitize_filename(dataset_name)
        speaker_id_safe = sanitize_filename(speaker_id)
        sentence_excerpt = sanitize_filename(sentence_text[:20])
        base_name = f"{dataset_name_safe}_{speaker_id_safe}_line{line_number}_{sentence_excerpt}_{timestamp}"
        return f"{base_name}.wav", f"{base_name}.txt"

    def save_recording(self, audio_file, speaker_id: str, dataset_name: str) -> Tuple[bool, str, Dict]:
        """Save recording with enhanced error handling and logging"""
        if not all([audio_file, speaker_id, dataset_name]):
            missing = []
            if not audio_file:
                missing.append("audio recording")
            if not speaker_id:
                missing.append("speaker ID")
            if not dataset_name:
                missing.append("dataset name")
            return False, f"Missing required information: {', '.join(missing)}", {}

        # Check if sentences have been loaded
        if not self.sentences:
            return False, "No sentences have been loaded. Please load text before saving recordings.", {}
        if self.current_index >= len(self.sentences):
            return False, "Current sentence index is out of range.", {}

        try:
            # Validate inputs
            if not speaker_id.strip().isalnum():
                return False, "Speaker ID must contain only letters and numbers", {}
            if not dataset_name.strip().isalnum():
                return False, "Dataset name must contain only letters and numbers", {}

            # Get current sentence text
            sentence_text = self.sentences[self.current_index]

            # Generate filenames
            audio_name, text_name = self.generate_filenames(dataset_name, speaker_id, sentence_text)

            # Create speaker directories
            audio_dir = self.root_path / 'audio' / speaker_id
            text_dir = self.root_path / 'transcriptions' / speaker_id
            audio_dir.mkdir(parents=True, exist_ok=True)
            text_dir.mkdir(parents=True, exist_ok=True)

            # Save audio file
            audio_path = audio_dir / audio_name

            # Read the audio file using soundfile
            audio_data, sampling_rate = sf.read(audio_file)

            # Save audio file
            sf.write(str(audio_path), audio_data, sampling_rate)

            # Save transcription
            text_path = text_dir / text_name
            self.save_transcription(
                text_path,
                sentence_text,
                {
                    'speaker_id': speaker_id,
                    'dataset_name': dataset_name,
                    'timestamp': datetime.now().isoformat(),
                    'audio_file': audio_name,
                    'font_style': self.current_font
                }
            )

            # Update metadata
            self.update_metadata(speaker_id, dataset_name)

            # Store the recording
            self.recordings[self.current_index] = {
                'audio_file': audio_file,
                'speaker_id': speaker_id,
                'dataset_name': dataset_name,
                'sentence': self.sentences[self.current_index]
            }

            # Log success
            self.log_operation(
                f"Saved recording: Speaker={speaker_id}, Dataset={dataset_name}, "
                f"Audio={audio_name}, Text={text_name}"
            )

            return True, f"Recording saved successfully as {audio_name}", self.recordings

        except Exception as e:
            error_msg = f"Error saving recording: {str(e)}"
            self.log_operation(error_msg, "error")
            logger.error(traceback.format_exc())
            return False, error_msg, self.recordings

    def save_transcription(self, file_path: Path, text: str, metadata: Dict) -> None:
        """Save transcription with metadata"""
        content = f"""[METADATA]
Recording_ID: {metadata['audio_file']}
Speaker_ID: {metadata['speaker_id']}
Dataset_Name: {metadata['dataset_name']}
Timestamp: {metadata['timestamp']}
Font_Style: {metadata['font_style']}
[TEXT]
{text}
"""
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def update_metadata(self, speaker_id: str, dataset_name: str) -> None:
        """Update dataset metadata with error handling"""
        metadata_file = self.root_path / 'metadata' / 'dataset_info.json'

        try:
            if metadata_file.exists():
                with open(metadata_file, 'r') as f:
                    metadata = json.load(f)
            else:
                metadata = {'speakers': {}, 'last_updated': None}

            # Update speaker data
            if speaker_id not in metadata['speakers']:
                metadata['speakers'][speaker_id] = {
                    'total_recordings': 0,
                    'datasets': {}
                }

            if dataset_name not in metadata['speakers'][speaker_id]['datasets']:
                metadata['speakers'][speaker_id]['datasets'][dataset_name] = {
                    'recordings': 0,
                    'sentences': len(self.sentences),
                    'recorded_sentences': [],
                    'first_recording': datetime.now().isoformat(),
                    'last_recording': None,
                    'font_styles_used': []
                }

            # Update counts and timestamps
            metadata['speakers'][speaker_id]['total_recordings'] += 1
            metadata['speakers'][speaker_id]['datasets'][dataset_name]['recordings'] += 1
            metadata['speakers'][speaker_id]['datasets'][dataset_name]['last_recording'] = \
                datetime.now().isoformat()

            # Add current index to recorded sentences
            if self.current_index not in metadata['speakers'][speaker_id]['datasets'][dataset_name]['recorded_sentences']:
                metadata['speakers'][speaker_id]['datasets'][dataset_name]['recorded_sentences'].append(self.current_index)

            # Update font styles
            if self.current_font not in metadata['speakers'][speaker_id]['datasets'][dataset_name]['font_styles_used']:
                metadata['speakers'][speaker_id]['datasets'][dataset_name]['font_styles_used'].append(
                    self.current_font
                )

            metadata['last_updated'] = datetime.now().isoformat()

            # Save updated metadata
            with open(metadata_file, 'w') as f:
                json.dump(metadata, f, indent=2)

            self.log_operation(f"Updated metadata for {speaker_id} in {dataset_name}")

        except Exception as e:
            error_msg = f"Error updating metadata: {str(e)}"
            self.log_operation(error_msg, "error")
            logger.error(traceback.format_exc())

    def get_navigation_info(self) -> Dict[str, Optional[str]]:
        """Get current and next sentence information"""
        if not self.sentences:
            return {
                'current': None,
                'next': None,
                'progress': "No text loaded"
            }

        current = self.get_styled_text(self.sentences[self.current_index])
        next_text = None

        if self.current_index < len(self.sentences) - 1:
            next_text = self.get_styled_text(self.sentences[self.current_index + 1])

        progress = f"Sentence {self.current_index + 1} of {len(self.sentences)}"

        return {
            'current': current,
            'next': next_text,
            'progress': progress
        }

    def navigate(self, direction: str) -> Dict[str, Optional[str]]:
        """Navigate through sentences"""
        if not self.sentences:
            return {
                'current': None,
                'next': None,
                'progress': "No text loaded",
                'status': "⚠️ Please load a text file first"
            }

        if direction == "next" and self.current_index < len(self.sentences) - 1:
            self.current_index += 1
        elif direction == "prev" and self.current_index > 0:
            self.current_index -= 1

        nav_info = self.get_navigation_info()
        nav_info['status'] = "✅ Navigation successful"

        return nav_info

    def get_dataset_statistics(self) -> Dict:
        """Get current dataset statistics"""
        try:
            metadata_file = self.root_path / 'metadata' / 'dataset_info.json'
            if not metadata_file.exists():
                return {}
            with open(metadata_file, 'r') as f:
                metadata = json.load(f)
            # Flatten statistics for display
            total_sentences = len(self.sentences)
            recorded = sum(len(dataset.get('recorded_sentences', [])) for speaker in metadata['speakers'].values() for dataset in speaker['datasets'].values())
            remaining = total_sentences - recorded
            stats = {
                "Total Sentences": total_sentences,
                "Recorded Sentences": recorded,
                "Remaining Sentences": remaining,
                "Last Updated": metadata.get('last_updated', 'N/A')
            }
            return stats
        except Exception as e:
            logger.error(f"Error reading dataset statistics: {str(e)}")
            return {}

    def get_last_audio_path(self, speaker_id: str) -> Optional[str]:
        """Get the path to the last saved audio file for downloading"""
        audio_dir = self.root_path / 'audio' / speaker_id
        audio_files = sorted(audio_dir.glob('*.wav'), key=lambda f: f.stat().st_mtime, reverse=True)
        if audio_files:
            return str(audio_files[0])
        else:
            return None

    def get_last_transcript_path(self, speaker_id: str) -> Optional[str]:
        """Get the path to the last saved transcription file for downloading"""
        text_dir = self.root_path / 'transcriptions' / speaker_id
        text_files = sorted(text_dir.glob('*.txt'), key=lambda f: f.stat().st_mtime, reverse=True)
        if text_files:
            return str(text_files[0])
        else:
            return None

    def create_zip_archive(self, speaker_id: str) -> Optional[str]:
        """Create a ZIP archive of all recordings and transcriptions for a speaker"""
        try:
            from zipfile import ZipFile
            import tempfile

            # Create temporary zip file
            temp_dir = Path(tempfile.gettempdir())
            zip_path = temp_dir / f"{speaker_id}_recordings.zip"

            with ZipFile(zip_path, 'w') as zipf:
                # Add audio files
                audio_dir = self.root_path / 'audio' / speaker_id
                if audio_dir.exists():
                    for audio_file in audio_dir.glob('*.wav'):
                        zipf.write(audio_file, f"audio/{audio_file.name}")

                # Add transcription files
                text_dir = self.root_path / 'transcriptions' / speaker_id
                if text_dir.exists():
                    for text_file in text_dir.glob('*.txt'):
                        zipf.write(text_file, f"transcriptions/{text_file.name}")

            return str(zip_path)
        except Exception as e:
            logger.error(f"Error creating zip archive: {str(e)}")
            return None


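# A minimal sketch of driving TTSDatasetCollector directly from a Python shell,
# e.g. to sanity-check sentence splitting and the on-disk layout without the
# Gradio UI (illustrative only; "sample.wav", "speaker01" and "demo" are
# hypothetical, and the WAV file must already exist for save_recording to work):
#
#   collector = TTSDatasetCollector()
#   ok, msg = collector.process_text("First sentence. Second sentence!")
#   print(ok, msg)                           # True, "Successfully loaded 2 sentences"
#   print(collector.get_navigation_info())   # styled current/next sentence and progress
#   ok, msg, recs = collector.save_recording("sample.wav", "speaker01", "demo")
#   print(collector.get_dataset_statistics())
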
def create_interface():
    """Create Gradio interface with enhanced features"""

    collector = TTSDatasetCollector()

    # Create custom CSS for fonts
    custom_css = """
    .gradio-container {
        max-width: 1200px !important;
    }
    .record-button {
        font-size: 1em !important;
        padding: 10px !important;
    }
    .sentence-display {
        font-size: 1.4em !important;
        padding: 15px !important;
        border: 1px solid #ddd !important;
        border-radius: 8px !important;
        margin: 10px 0 !important;
        min-height: 100px !important;
    }
    .small-input {
        max-width: 300px !important;
    }
    """

    # Include Google Fonts for Nastaliq and Naskh
    google_fonts_css = """
    @import url('https://fonts.googleapis.com/earlyaccess/notonastaliqurdu.css');
    @import url('https://fonts.googleapis.com/css2?family=Scheherazade+New&display=swap');
    """

    custom_css = google_fonts_css + custom_css

    with gr.Blocks(title="TTS Dataset Collection Tool", css=custom_css) as interface:
        gr.Markdown("# TTS Dataset Collection Tool")

        status = gr.Textbox(
            label="Status",
            interactive=False,
            max_lines=3,
            elem_classes=["small-input"]
        )

        with gr.Row():
            # Left column - Configuration and Input
            with gr.Column(scale=1):
                text_input = gr.Textbox(
                    label="Paste Text",
                    placeholder="Paste your text here...",
                    lines=5,
                    elem_classes=["small-input"],
                    interactive=True
                )
                file_input = gr.File(
                    label="Or Upload Text File (.txt)",
                    file_types=[".txt"],
                    elem_classes=["small-input"]
                )
                speaker_id = gr.Textbox(
                    label="Speaker ID",
                    placeholder="Enter unique speaker identifier (letters and numbers only)",
                    elem_classes=["small-input"]
                )
                dataset_name = gr.Textbox(
                    label="Dataset Name",
                    placeholder="Enter dataset name (letters and numbers only)",
                    elem_classes=["small-input"]
                )
                font_select = gr.Dropdown(
                    choices=list(FONT_STYLES.keys()),
                    value="english_serif",
                    label="Select Font Style",
                    elem_classes=["small-input"]
                )
                # Custom font upload
                with gr.Accordion("Custom Font Upload", open=False):
                    font_file_input = gr.File(
                        label="Upload Custom Font (.ttf)",
                        file_types=[".ttf"],
                        elem_classes=["small-input"],
                        type="filepath"
                    )
                    add_font_btn = gr.Button("Add Custom Font")

                # Dataset Info
                with gr.Accordion("Dataset Statistics", open=False):
                    dataset_info = gr.JSON(
                        label="",
                        value={}
                    )

            # Right column - Recording
            with gr.Column(scale=2):
                current_text = gr.HTML(
                    label="Current Sentence",
                    elem_classes=["sentence-display"]
                )
                next_text = gr.HTML(
                    label="Next Sentence",
                    elem_classes=["sentence-display"]
                )
                progress = gr.HTML("")

                with gr.Row():
                    audio_recorder = gr.Audio(
                        label="Record Audio",
                        type="filepath",
                        elem_classes=["record-button"],
                        interactive=True,
                        streaming=False  # Disable streaming to prevent freezing
                    )
                    clear_btn = gr.Button("Clear Recording", variant="secondary")

                # Controls
                with gr.Row():
                    prev_btn = gr.Button("Previous", variant="secondary")
                    save_btn = gr.Button("Save Recording", variant="primary")
                    next_btn = gr.Button("Next", variant="primary")

                # Download Links
                with gr.Row():
                    download_audio = gr.File(label="Download Last Audio", interactive=False)
                    download_transcript = gr.File(label="Download Last Transcript", interactive=False)
                    download_all = gr.File(label="Download All Recordings", interactive=False)

                def download_all_recordings(speaker_id_value):
                    """Handle downloading all recordings for a speaker"""
                    if not speaker_id_value:
                        return {
                            status: "⚠️ Please enter a Speaker ID first",
                            download_all: None
                        }

                    zip_path = collector.create_zip_archive(speaker_id_value)
                    if zip_path:
                        return {
                            status: "✅ Archive created successfully",
                            download_all: zip_path
                        }
                    return {
                        status: "❌ Failed to create archive",
                        download_all: None
                    }

                # Add download all button and its event handler
                download_all_btn = gr.Button("Download All Recordings", variant="secondary")
                download_all_btn.click(
                    download_all_recordings,
                    inputs=[speaker_id],
                    outputs=[status, download_all]
                )

            # Add recordings display
            with gr.Column(scale=2):
                recordings_display = gr.HTML(
                    label="Saved Recordings",
                    value="<div id='recordings-list'></div>"
                )

        def process_pasted_text(text):
            """Handle pasted text input"""
            if not text:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: "⚠️ No text provided",
                    dataset_info: collector.get_dataset_statistics()
                }

            success, msg = collector.process_text(text)
            if not success:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: f"❌ {msg}",
                    dataset_info: collector.get_dataset_statistics()
                }

            nav_info = collector.get_navigation_info()
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
            return {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: f"✅ {msg}",
                dataset_info: collector.get_dataset_statistics()
            }

        def update_font(font_style):
            """Update font and refresh display"""
            success, msg = collector.set_font(font_style)
            if not success:
                return {status: msg}

            nav_info = collector.get_navigation_info()
            return {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                status: f"Font updated to {font_style}"
            }

        def load_file(file):
            """Handle file loading with enhanced error reporting"""
            if not file:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: "⚠️ No file selected",
                    dataset_info: collector.get_dataset_statistics()
                }

            success, msg = collector.load_text_file(file)
            if not success:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: f"❌ {msg}",
                    dataset_info: collector.get_dataset_statistics()
                }

            nav_info = collector.get_navigation_info()
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
            return {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: f"✅ {msg}",
                dataset_info: collector.get_dataset_statistics()
            }

        def save_current_recording(audio_file, speaker_id_value, dataset_name_value):
            """Handle saving the current recording"""
            if not audio_file:
                return {
                    status: "⚠️ Please record audio first",
                    download_audio: None,
                    download_transcript: None,
                    download_all: None,
                    recordings_display: "<div id='recordings-list'>No recordings yet</div>",
                    audio_recorder: None  # Clear the recorder
                }

            success, msg, recordings = collector.save_recording(
                audio_file, speaker_id_value, dataset_name_value
            )

            if not success:
                return {
                    status: f"❌ {msg}",
                    dataset_info: collector.get_dataset_statistics(),
                    download_audio: None,
                    download_transcript: None,
                    download_all: None,
                    recordings_display: "<div id='recordings-list'>No recordings yet</div>"
                }

            # Get paths to the saved files
            audio_path = collector.get_last_audio_path(speaker_id_value)
            transcript_path = collector.get_last_transcript_path(speaker_id_value)
            zip_path = collector.create_zip_archive(speaker_id_value)

            # Auto-advance to next sentence after successful save
            nav_info = collector.navigate("next")
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"

            # Update recordings display
            recordings_html = create_recordings_display(recordings)

            result = {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: f"✅ {msg}",
                dataset_info: collector.get_dataset_statistics(),
                download_audio: audio_path,
                download_transcript: transcript_path,
                download_all: zip_path,
                recordings_display: recordings_html,
                audio_recorder: None  # Clear the recorder after successful save
            }
            return result

        def create_recordings_display(recordings):
            """Create HTML display for recordings"""
            recordings_html = "<div id='recordings-list'><h3>Saved Recordings:</h3>"
            for idx, rec in recordings.items():
                recordings_html += f"""
                <div style='margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
                    <p><strong>Sentence {idx + 1}:</strong> {rec['sentence']}</p>
                    <audio controls src='{rec['audio_file']}'></audio>
                </div>
                """
            recordings_html += "</div>"
            return recordings_html

        def navigate_sentences(direction):
            """Handle navigation between sentences"""
            nav_info = collector.navigate(direction)
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
            return {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: nav_info['status']
            }

        def add_custom_font(font_file_path):
            """Handle adding a custom font"""
            if not font_file_path:
                return {
                    font_select: gr.update(),
                    status: "⚠️ No font file selected"
                }
            success, msg = collector.add_custom_font(font_file_path)
            if not success:
                return {
                    font_select: gr.update(),
                    status: f"❌ {msg}"
                }
            # Update font dropdown
            font_choices = list(FONT_STYLES.keys()) + list(collector.custom_fonts.keys())
            # Return updates to font_select and status
            return {
                font_select: gr.update(choices=font_choices),
                status: f"✅ {msg}"
            }

        def clear_recording():
            """Clear the current recording"""
            return {
                audio_recorder: None,
                status: "Recording cleared"
            }

        # Add clear button handler
        clear_btn.click(
            clear_recording,
            outputs=[audio_recorder, status]
        )

        # Event handlers
        text_input.change(
            process_pasted_text,
            inputs=[text_input],
            outputs=[current_text, next_text, progress, status, dataset_info]
        )

        file_input.upload(
            load_file,
            inputs=[file_input],
            outputs=[current_text, next_text, progress, status, dataset_info]
        )

        font_select.change(
            update_font,
            inputs=[font_select],
            outputs=[current_text, next_text, status]
        )

        add_font_btn.click(
            add_custom_font,
            inputs=[font_file_input],
            outputs=[font_select, status]
        )

        save_btn.click(
            save_current_recording,
            inputs=[audio_recorder, speaker_id, dataset_name],
            outputs=[current_text, next_text, progress, status, dataset_info,
                     download_audio, download_transcript, download_all, recordings_display,
                     audio_recorder]  # Add audio_recorder to outputs
        )

        prev_btn.click(
            lambda: navigate_sentences("prev"),
            outputs=[current_text, next_text, progress, status]
        )

        next_btn.click(
            lambda: navigate_sentences("next"),
            outputs=[current_text, next_text, progress, status]
        )

        # Initialize dataset info
        dataset_info.value = collector.get_dataset_statistics()

    return interface

if __name__ == "__main__":
    try:
        # Set up any required environment variables
        os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
        os.environ["GRADIO_SERVER_PORT"] = "7860"

        # Create and launch the interface
        interface = create_interface()
        interface.queue()  # Enable queuing for better handling of concurrent users
        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=True,
            debug=True,
            show_error=True
        )
    except Exception as e:
        logger.error(f"Failed to launch interface: {str(e)}")
        logger.error(traceback.format_exc())
        raise
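
Each transcription written by save_transcription() pairs a [METADATA] header with the prompt text, so the collected dataset can be read back without extra tooling. A minimal parsing sketch (the file name below is a hypothetical example of the dataset/transcriptions/<speaker_id>/ layout created above, not a path from this commit):

from pathlib import Path

# Hypothetical example path; real names follow generate_filenames() above
sample = Path("dataset/transcriptions/speaker01/demo_speaker01_line1_First_sentence_20240101120000.txt")

raw = sample.read_text(encoding="utf-8")
header, text = raw.split("[TEXT]", 1)      # [METADATA] block, then the prompt text
meta = dict(
    line.split(": ", 1)
    for line in header.splitlines()
    if ": " in line                        # skip the bare [METADATA] marker
)
print(meta["Speaker_ID"], meta["Font_Style"])
print(text.strip())                        # the sentence that was read aloud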