Omarrran committed on
Commit
5b7d395
·
verified ·
1 Parent(s): d330692

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +949 -35
app.py CHANGED
@@ -1,38 +1,952 @@
 
 
 
 
 
 
1
  import gradio as gr
2
- import torch
3
- import nemo.collections.asr as nemo_asr
4
-
5
- # Load the ASR model
6
- def load_model():
7
- model = nemo_asr.models.ASRModel.from_pretrained("ai4bharat/indicconformer_stt_ks_hybrid_ctc_rnnt_large")
8
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9
- model.freeze() # Set the model in inference mode
10
- model = model.to(device)
11
- return model
12
-
13
- # Transcribe audio
14
- def transcribe_audio(audio):
15
- # Load the model (only once)
16
- model = load_model()
17
-
18
- # audio is the path to the uploaded file, use it directly
19
- audio_path = audio
20
-
21
- # Transcribe using the RNNT model
22
- model.cur_decoder = "rnnt"
23
- transcription = model.transcribe([audio_path], batch_size=1, language_id='ks')[0]
24
-
25
- return transcription
26
-
27
- # Create the Gradio interface
28
- iface = gr.Interface(
29
- fn=transcribe_audio,
30
- inputs=gr.Audio(type="filepath", label="Record or Upload Audio"), # Allows both record and upload
31
- outputs=gr.Textbox(label="Transcription"),
32
- live=True,
33
- title="IndicConformer ASR Model for Kashmiri",
34
- description="This model transcribes Kashmiri speech to text using the IndicConformer ASR model. You can either record or upload an audio file."
35
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- # Launch the Gradio interface
38
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TTS Dataset Collection Tool with Custom Fonts and Enhanced Features
3
+ """
4
+ import os
5
+ import json
6
+ import nltk
7
  import gradio as gr
8
+ import uuid
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ import logging
12
+ from typing import Dict, Tuple, Optional
13
+ import traceback
14
+ import soundfile as sf
15
+ import re
16
+
17
+ # Download required NLTK data during initialization
18
+ try:
19
+ nltk.download('punkt') # Download punkt tokenizer data
20
+ nltk.data.find('tokenizers/punkt')
21
+ except Exception as e:
22
+ logger.warning(f"Error downloading NLTK data: {str(e)}")
23
+ logger.warning("NLTK tokenization might not work properly")
24
+
25
+ # Configure logging
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ format='%(asctime)s - %(levelname)s - %(message)s'
 
 
 
 
 
 
 
 
 
 
 
 
29
  )
30
+ logger = logging.getLogger(__name__)
31
+
32
# Font configurations
# Built-in font styles selectable in the UI. Each entry provides:
#   name   - human-readable label shown to the user
#   family - the CSS font-family name
#   css    - inline CSS applied to the sentence display
# Custom uploaded fonts are merged into this dict at runtime
# (see TTSDatasetCollector.add_custom_font).
FONT_STYLES = {
    "english_serif": {
        "name": "Times New Roman",
        "family": "Times New Roman",
        "css": "font-family: 'Times New Roman', serif;"
    },
    "english_sans": {
        "name": "Arial",
        "family": "Arial",
        "css": "font-family: Arial, sans-serif;"
    },
    "nastaliq": {
        "name": "Nastaliq",
        "family": "Noto Nastaliq Urdu",
        "css": "font-family: 'Noto Nastaliq Urdu', serif;"
    },
    "naskh": {
        "name": "Naskh",
        "family": "Scheherazade New",
        "css": "font-family: 'Scheherazade New', serif;"
    }
}
55
+
56
+
57
+ class TTSDatasetCollector:
58
+ """Manages TTS dataset collection and organization with enhanced features"""
59
+
60
    def __init__(self):
        """Initialize the TTS Dataset Collector"""
        # Handle both script and notebook environments for root path
        try:
            # When running as a script, __file__ is defined.
            self.root_path = Path(os.path.dirname(os.path.abspath(__file__))) / "dataset"
        except NameError:
            # When running in Jupyter/IPython, fall back to the working dir.
            self.root_path = Path.cwd() / "dataset"

        self.fonts_path = self.root_path / "fonts"
        self.sentences = []  # sentences loaded from pasted text or a file
        self.current_index = 0  # index of the sentence currently displayed
        self.current_font = "english_serif"  # key into FONT_STYLES
        self.custom_fonts = {}  # uploaded fonts, keyed by generated family name
        self.recordings = {}  # Store recordings by sentence index
        self.setup_directories()

        # Ensure NLTK data is downloaded
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', quiet=True)

        logger.info("TTS Dataset Collector initialized")
85
+
86
+ def setup_directories(self) -> None:
87
+ """Create necessary directory structure with logging"""
88
+ try:
89
+ # Create main dataset directory
90
+ self.root_path.mkdir(parents=True, exist_ok=True)
91
+
92
+ # Create subdirectories
93
+ for subdir in ['audio', 'transcriptions', 'metadata', 'fonts']:
94
+ (self.root_path / subdir).mkdir(parents=True, exist_ok=True)
95
+
96
+ # Initialize log file
97
+ log_file = self.root_path / 'dataset_log.txt'
98
+ if not log_file.exists():
99
+ with open(log_file, 'w', encoding='utf-8') as f:
100
+ f.write(f"Dataset collection initialized on {datetime.now().isoformat()}\n")
101
+
102
+ logger.info("Directory structure created successfully")
103
+
104
+ except Exception as e:
105
+ logger.error(f"Failed to create directory structure: {str(e)}")
106
+ logger.error(traceback.format_exc())
107
+ raise RuntimeError("Failed to initialize directory structure")
108
+
109
+ def log_operation(self, message: str, level: str = "info") -> None:
110
+ """Log operations with timestamp and level"""
111
+ try:
112
+ log_file = self.root_path / 'dataset_log.txt'
113
+ timestamp = datetime.now().isoformat()
114
+
115
+ with open(log_file, 'a', encoding='utf-8') as f:
116
+ f.write(f"[{timestamp}] [{level.upper()}] {message}\n")
117
+
118
+ if level.lower() == "error":
119
+ logger.error(message)
120
+ else:
121
+ logger.info(message)
122
+
123
+ except Exception as e:
124
+ logger.error(f"Failed to log operation: {str(e)}")
125
+
126
    def process_text(self, text: str) -> Tuple[bool, str]:
        """Process pasted or loaded text with error handling.

        Splits *text* into sentences (NLTK first, a regex fallback on
        failure), stores them on self.sentences, and resets the cursor.

        Returns:
            (success, message) tuple suitable for UI display.
        """
        try:
            if not text.strip():
                return False, "Text is empty"

            # Simple sentence splitting as fallback
            # NOTE(review): indentation reconstructed from a diff; as written,
            # each non-empty fragment between ./!/? becomes its own sentence,
            # so `current` never accumulates more than one part per append —
            # confirm against the original source.
            def simple_split_sentences(text):
                # Split on common sentence endings
                sentences = []
                current = []

                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    # Split on common sentence endings
                    parts = re.split(r'[.!?]', line)
                    for part in parts:
                        part = part.strip()
                        if part:
                            current.append(part)
                            sentences.append(' '.join(current))
                            current = []

                if current:
                    sentences.append(' '.join(current))

                return [s.strip() for s in sentences if s.strip()]

            try:
                # Try NLTK first
                self.sentences = nltk.sent_tokenize(text.strip())
            except Exception as e:
                logger.warning(f"NLTK tokenization failed, falling back to simple splitting: {str(e)}")
                # Fallback to simple splitting
                self.sentences = simple_split_sentences(text.strip())

            if not self.sentences:
                return False, "No valid sentences found in text"

            # Restart navigation from the first sentence of the new text.
            self.current_index = 0

            # Log success
            self.log_operation(f"Processed text with {len(self.sentences)} sentences")
            return True, f"Successfully loaded {len(self.sentences)} sentences"

        except Exception as e:
            error_msg = f"Error processing text: {str(e)}"
            self.log_operation(error_msg, "error")
            logger.error(traceback.format_exc())
            return False, error_msg
179
+
180
+ def load_text_file(self, file) -> Tuple[bool, str]:
181
+ """Process and load text file with enhanced error handling"""
182
+ if not file:
183
+ return False, "No file provided"
184
+
185
+ try:
186
+ # Validate file extension
187
+ if not file.name.endswith('.txt'):
188
+ return False, "Only .txt files are supported"
189
+
190
+ text = file.read().decode('utf-8')
191
+
192
+ return self.process_text(text)
193
+
194
+ except UnicodeDecodeError:
195
+ error_msg = "File encoding error. Please ensure the file is UTF-8 encoded"
196
+ self.log_operation(error_msg, "error")
197
+ return False, error_msg
198
+ except Exception as e:
199
+ error_msg = f"Error loading file: {str(e)}"
200
+ self.log_operation(error_msg, "error")
201
+ logger.error(traceback.format_exc())
202
+ return False, error_msg
203
+
204
+ def get_styled_text(self, text: str) -> str:
205
+ """Get text with current font styling"""
206
+ font_css = FONT_STYLES.get(self.current_font, {}).get('css', '')
207
+ return f"<div style='{font_css}'>{text}</div>"
208
+
209
+ def set_font(self, font_style: str) -> Tuple[bool, str]:
210
+ """Set the current font style"""
211
+ if font_style not in FONT_STYLES and font_style not in self.custom_fonts:
212
+ available_fonts = ', '.join(list(FONT_STYLES.keys()) + list(self.custom_fonts.keys()))
213
+ return False, f"Invalid font style. Available styles: {available_fonts}"
214
+ self.current_font = font_style
215
+ return True, f"Font style set to {font_style}"
216
+
217
+ def add_custom_font(self, font_file_path) -> Tuple[bool, str]:
218
+ """Add a custom font from the uploaded TTF file"""
219
+ try:
220
+ if not font_file_path:
221
+ return False, "No font file provided"
222
+
223
+ if not font_file_path.endswith('.ttf'):
224
+ return False, "Only .ttf font files are supported"
225
+
226
+ # Generate a unique font family name
227
+ font_family = f"font_{uuid.uuid4().hex[:8]}"
228
+ font_filename = font_family + '.ttf'
229
+ font_dest = self.fonts_path / font_filename
230
+
231
+ # Read and save the font file
232
+ with open(font_file_path, 'rb') as f_src, open(font_dest, 'wb') as f_dest:
233
+ f_dest.write(f_src.read())
234
+
235
+ # Add to custom fonts
236
+ self.custom_fonts[font_family] = {
237
+ 'name': os.path.basename(font_file_path),
238
+ 'family': font_family,
239
+ 'css': f"font-family: '{font_family}', serif;"
240
+ }
241
+
242
+ # Update the FONT_STYLES with the custom font
243
+ FONT_STYLES[font_family] = self.custom_fonts[font_family]
244
+
245
+ # Log success
246
+ self.log_operation(f"Added custom font: {font_file_path} as {font_family}")
247
+ return True, f"Custom font '{os.path.basename(font_file_path)}' added successfully"
248
+
249
+ except Exception as e:
250
+ error_msg = f"Error adding custom font: {str(e)}"
251
+ self.log_operation(error_msg, "error")
252
+ logger.error(traceback.format_exc())
253
+ return False, error_msg
254
+
255
+ def generate_filenames(self, dataset_name: str, speaker_id: str, sentence_text: str) -> Tuple[str, str]:
256
+ """Generate unique filenames for audio and text files"""
257
+ line_number = self.current_index + 1
258
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
259
+
260
+ # Sanitize strings for filenames
261
+ def sanitize_filename(s):
262
+ return re.sub(r'[^a-zA-Z0-9_-]', '_', s)[:50]
263
+
264
+ dataset_name_safe = sanitize_filename(dataset_name)
265
+ speaker_id_safe = sanitize_filename(speaker_id)
266
+ sentence_excerpt = sanitize_filename(sentence_text[:20])
267
+ base_name = f"{dataset_name_safe}_{speaker_id_safe}_line{line_number}_{sentence_excerpt}_{timestamp}"
268
+ return f"{base_name}.wav", f"{base_name}.txt"
269
+
270
    def save_recording(self, audio_file, speaker_id: str, dataset_name: str) -> Tuple[bool, str, Dict]:
        """Save recording with enhanced error handling and logging.

        Writes the audio (re-read/re-written via soundfile) and a
        transcription file, updates the JSON metadata, and records the
        result in the in-memory session map.

        Returns:
            (success, message, recordings) where recordings maps sentence
            index -> info about each recording saved this session.
        """
        # All three inputs are required; report exactly what is missing.
        if not all([audio_file, speaker_id, dataset_name]):
            missing = []
            if not audio_file:
                missing.append("audio recording")
            if not speaker_id:
                missing.append("speaker ID")
            if not dataset_name:
                missing.append("dataset name")
            return False, f"Missing required information: {', '.join(missing)}", {}

        # Check if sentences have been loaded
        if not self.sentences:
            return False, "No sentences have been loaded. Please load text before saving recordings.", {}
        if self.current_index >= len(self.sentences):
            return False, "Current sentence index is out of range.", {}

        try:
            # Validate inputs (both feed into filenames and directory names)
            if not speaker_id.strip().isalnum():
                return False, "Speaker ID must contain only letters and numbers", {}
            if not dataset_name.strip().isalnum():
                return False, "Dataset name must contain only letters and numbers", {}

            # Get current sentence text
            sentence_text = self.sentences[self.current_index]

            # Generate filenames
            audio_name, text_name = self.generate_filenames(dataset_name, speaker_id, sentence_text)

            # Create speaker directories
            audio_dir = self.root_path / 'audio' / speaker_id
            text_dir = self.root_path / 'transcriptions' / speaker_id
            audio_dir.mkdir(parents=True, exist_ok=True)
            text_dir.mkdir(parents=True, exist_ok=True)

            # Save audio file
            audio_path = audio_dir / audio_name

            # Read the audio file using soundfile
            audio_data, sampling_rate = sf.read(audio_file)

            # Save audio file (written as .wav at the original sampling rate)
            sf.write(str(audio_path), audio_data, sampling_rate)

            # Save transcription
            text_path = text_dir / text_name
            self.save_transcription(
                text_path,
                sentence_text,
                {
                    'speaker_id': speaker_id,
                    'dataset_name': dataset_name,
                    'timestamp': datetime.now().isoformat(),
                    'audio_file': audio_name,
                    'font_style': self.current_font
                }
            )

            # Update metadata
            self.update_metadata(speaker_id, dataset_name)

            # Store the recording (in-memory session state for the UI list)
            self.recordings[self.current_index] = {
                'audio_file': audio_file,
                'speaker_id': speaker_id,
                'dataset_name': dataset_name,
                'sentence': self.sentences[self.current_index]
            }

            # Log success
            self.log_operation(
                f"Saved recording: Speaker={speaker_id}, Dataset={dataset_name}, "
                f"Audio={audio_name}, Text={text_name}"
            )

            return True, f"Recording saved successfully as {audio_name}", self.recordings

        except Exception as e:
            error_msg = f"Error saving recording: {str(e)}"
            self.log_operation(error_msg, "error")
            logger.error(traceback.format_exc())
            return False, error_msg, self.recordings
354
+
355
    def save_transcription(self, file_path: Path, text: str, metadata: Dict) -> None:
        """Save transcription with metadata.

        Writes a flat text file with a [METADATA] header block followed by
        the sentence under [TEXT]. The header lines are intentionally
        unindented — they are part of the on-disk file format.
        """
        content = f"""[METADATA]
Recording_ID: {metadata['audio_file']}
Speaker_ID: {metadata['speaker_id']}
Dataset_Name: {metadata['dataset_name']}
Timestamp: {metadata['timestamp']}
Font_Style: {metadata['font_style']}
[TEXT]
{text}
"""
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
368
+
369
+ def update_metadata(self, speaker_id: str, dataset_name: str) -> None:
370
+ """Update dataset metadata with error handling"""
371
+ metadata_file = self.root_path / 'metadata' / 'dataset_info.json'
372
+
373
+ try:
374
+ if metadata_file.exists():
375
+ with open(metadata_file, 'r') as f:
376
+ metadata = json.load(f)
377
+ else:
378
+ metadata = {'speakers': {}, 'last_updated': None}
379
+
380
+ # Update speaker data
381
+ if speaker_id not in metadata['speakers']:
382
+ metadata['speakers'][speaker_id] = {
383
+ 'total_recordings': 0,
384
+ 'datasets': {}
385
+ }
386
+
387
+ if dataset_name not in metadata['speakers'][speaker_id]['datasets']:
388
+ metadata['speakers'][speaker_id]['datasets'][dataset_name] = {
389
+ 'recordings': 0,
390
+ 'sentences': len(self.sentences),
391
+ 'recorded_sentences': [],
392
+ 'first_recording': datetime.now().isoformat(),
393
+ 'last_recording': None,
394
+ 'font_styles_used': []
395
+ }
396
+
397
+ # Update counts and timestamps
398
+ metadata['speakers'][speaker_id]['total_recordings'] += 1
399
+ metadata['speakers'][speaker_id]['datasets'][dataset_name]['recordings'] += 1
400
+ metadata['speakers'][speaker_id]['datasets'][dataset_name]['last_recording'] = \
401
+ datetime.now().isoformat()
402
+
403
+ # Add current index to recorded sentences
404
+ if self.current_index not in metadata['speakers'][speaker_id]['datasets'][dataset_name]['recorded_sentences']:
405
+ metadata['speakers'][speaker_id]['datasets'][dataset_name]['recorded_sentences'].append(self.current_index)
406
+
407
+ # Update font styles
408
+ if self.current_font not in metadata['speakers'][speaker_id]['datasets'][dataset_name]['font_styles_used']:
409
+ metadata['speakers'][speaker_id]['datasets'][dataset_name]['font_styles_used'].append(
410
+ self.current_font
411
+ )
412
+
413
+ metadata['last_updated'] = datetime.now().isoformat()
414
+
415
+ # Save updated metadata
416
+ with open(metadata_file, 'w') as f:
417
+ json.dump(metadata, f, indent=2)
418
+
419
+ self.log_operation(f"Updated metadata for {speaker_id} in {dataset_name}")
420
+
421
+ except Exception as e:
422
+ error_msg = f"Error updating metadata: {str(e)}"
423
+ self.log_operation(error_msg, "error")
424
+ logger.error(traceback.format_exc())
425
+
426
+ def get_navigation_info(self) -> Dict[str, Optional[str]]:
427
+ """Get current and next sentence information"""
428
+ if not self.sentences:
429
+ return {
430
+ 'current': None,
431
+ 'next': None,
432
+ 'progress': "No text loaded"
433
+ }
434
+
435
+ current = self.get_styled_text(self.sentences[self.current_index])
436
+ next_text = None
437
+
438
+ if self.current_index < len(self.sentences) - 1:
439
+ next_text = self.get_styled_text(self.sentences[self.current_index + 1])
440
+
441
+ progress = f"Sentence {self.current_index + 1} of {len(self.sentences)}"
442
+
443
+ return {
444
+ 'current': current,
445
+ 'next': next_text,
446
+ 'progress': progress
447
+ }
448
+
449
+ def navigate(self, direction: str) -> Dict[str, Optional[str]]:
450
+ """Navigate through sentences"""
451
+ if not self.sentences:
452
+ return {
453
+ 'current': None,
454
+ 'next': None,
455
+ 'progress': "No text loaded",
456
+ 'status': "⚠️ Please load a text file first"
457
+ }
458
+
459
+ if direction == "next" and self.current_index < len(self.sentences) - 1:
460
+ self.current_index += 1
461
+ elif direction == "prev" and self.current_index > 0:
462
+ self.current_index -= 1
463
+
464
+ nav_info = self.get_navigation_info()
465
+ nav_info['status'] = "✅ Navigation successful"
466
+
467
+ return nav_info
468
+
469
+ def get_dataset_statistics(self) -> Dict:
470
+ """Get current dataset statistics"""
471
+ try:
472
+ metadata_file = self.root_path / 'metadata' / 'dataset_info.json'
473
+ if not metadata_file.exists():
474
+ return {}
475
+ with open(metadata_file, 'r') as f:
476
+ metadata = json.load(f)
477
+ # Flatten statistics for display
478
+ total_sentences = len(self.sentences)
479
+ recorded = sum(len(dataset.get('recorded_sentences', [])) for speaker in metadata['speakers'].values() for dataset in speaker['datasets'].values())
480
+ remaining = total_sentences - recorded
481
+ stats = {
482
+ "Total Sentences": total_sentences,
483
+ "Recorded Sentences": recorded,
484
+ "Remaining Sentences": remaining,
485
+ "Last Updated": metadata.get('last_updated', 'N/A')
486
+ }
487
+ return stats
488
+ except Exception as e:
489
+ logger.error(f"Error reading dataset statistics: {str(e)}")
490
+ return {}
491
+
492
+ def get_last_audio_path(self, speaker_id: str) -> Optional[str]:
493
+ """Get the path to the last saved audio file for downloading"""
494
+ audio_dir = self.root_path / 'audio' / speaker_id
495
+ audio_files = sorted(audio_dir.glob('*.wav'), key=lambda f: f.stat().st_mtime, reverse=True)
496
+ if audio_files:
497
+ return str(audio_files[0])
498
+ else:
499
+ return None
500
+
501
+ def get_last_transcript_path(self, speaker_id: str) -> Optional[str]:
502
+ """Get the path to the last saved transcription file for downloading"""
503
+ text_dir = self.root_path / 'transcriptions' / speaker_id
504
+ text_files = sorted(text_dir.glob('*.txt'), key=lambda f: f.stat().st_mtime, reverse=True)
505
+ if text_files:
506
+ return str(text_files[0])
507
+ else:
508
+ return None
509
+
510
+ def create_zip_archive(self, speaker_id: str) -> Optional[str]:
511
+ """Create a ZIP archive of all recordings and transcriptions for a speaker"""
512
+ try:
513
+ from zipfile import ZipFile
514
+ import tempfile
515
+
516
+ # Create temporary zip file
517
+ temp_dir = Path(tempfile.gettempdir())
518
+ zip_path = temp_dir / f"{speaker_id}_recordings.zip"
519
+
520
+ with ZipFile(zip_path, 'w') as zipf:
521
+ # Add audio files
522
+ audio_dir = self.root_path / 'audio' / speaker_id
523
+ if audio_dir.exists():
524
+ for audio_file in audio_dir.glob('*.wav'):
525
+ zipf.write(audio_file, f"audio/{audio_file.name}")
526
+
527
+ # Add transcription files
528
+ text_dir = self.root_path / 'transcriptions' / speaker_id
529
+ if text_dir.exists():
530
+ for text_file in text_dir.glob('*.txt'):
531
+ zipf.write(text_file, f"transcriptions/{text_file.name}")
532
+
533
+ return str(zip_path)
534
+ except Exception as e:
535
+ logger.error(f"Error creating zip archive: {str(e)}")
536
+ return None
537
+
538
+
539
def create_interface():
    """Create Gradio interface with enhanced features.

    Builds the Blocks layout, wires all event handlers as closures over a
    single TTSDatasetCollector instance, and returns the Blocks object.
    NOTE(review): handlers return dicts keyed by component objects
    (partial updates) — this requires a Gradio version that supports
    dict-style returns; confirm against the pinned gradio dependency.
    """

    collector = TTSDatasetCollector()

    # Create custom CSS for fonts
    custom_css = """
    .gradio-container {
        max-width: 1200px !important;
    }
    .record-button {
        font-size: 1em !important;
        padding: 10px !important;
    }
    .sentence-display {
        font-size: 1.4em !important;
        padding: 15px !important;
        border: 1px solid #ddd !important;
        border-radius: 8px !important;
        margin: 10px 0 !important;
        min-height: 100px !important;
    }
    .small-input {
        max-width: 300px !important;
    }
    """

    # Include Google Fonts for Nastaliq and Naskh
    google_fonts_css = """
    @import url('https://fonts.googleapis.com/earlyaccess/notonastaliqurdu.css');
    @import url('https://fonts.googleapis.com/css2?family=Scheherazade+New&display=swap');
    """

    custom_css = google_fonts_css + custom_css

    with gr.Blocks(title="TTS Dataset Collection Tool", css=custom_css) as interface:
        gr.Markdown("# TTS Dataset Collection Tool")

        # Shared status line updated by every handler below.
        status = gr.Textbox(
            label="Status",
            interactive=False,
            max_lines=3,
            elem_classes=["small-input"]
        )

        with gr.Row():
            # Left column - Configuration and Input
            with gr.Column(scale=1):
                text_input = gr.Textbox(
                    label="Paste Text",
                    placeholder="Paste your text here...",
                    lines=5,
                    elem_classes=["small-input"],
                    interactive=True
                )
                file_input = gr.File(
                    label="Or Upload Text File (.txt)",
                    file_types=[".txt"],
                    elem_classes=["small-input"]
                )
                speaker_id = gr.Textbox(
                    label="Speaker ID",
                    placeholder="Enter unique speaker identifier (letters and numbers only)",
                    elem_classes=["small-input"]
                )
                dataset_name = gr.Textbox(
                    label="Dataset Name",
                    placeholder="Enter dataset name (letters and numbers only)",
                    elem_classes=["small-input"]
                )
                font_select = gr.Dropdown(
                    choices=list(FONT_STYLES.keys()),
                    value="english_serif",
                    label="Select Font Style",
                    elem_classes=["small-input"]
                )
                # Custom font upload
                with gr.Accordion("Custom Font Upload", open=False):
                    font_file_input = gr.File(
                        label="Upload Custom Font (.ttf)",
                        file_types=[".ttf"],
                        elem_classes=["small-input"],
                        type="filepath"
                    )
                    add_font_btn = gr.Button("Add Custom Font")

                # Dataset Info
                with gr.Accordion("Dataset Statistics", open=False):
                    dataset_info = gr.JSON(
                        label="",
                        value={}
                    )

            # Right column - Recording
            with gr.Column(scale=2):
                current_text = gr.HTML(
                    label="Current Sentence",
                    elem_classes=["sentence-display"]
                )
                next_text = gr.HTML(
                    label="Next Sentence",
                    elem_classes=["sentence-display"]
                )
                progress = gr.HTML("")

                with gr.Row():
                    audio_recorder = gr.Audio(
                        label="Record Audio",
                        type="filepath",
                        elem_classes=["record-button"],
                        interactive=True,
                        streaming=False  # Disable streaming to prevent freezing
                    )
                    clear_btn = gr.Button("Clear Recording", variant="secondary")

                # Controls
                with gr.Row():
                    prev_btn = gr.Button("Previous", variant="secondary")
                    save_btn = gr.Button("Save Recording", variant="primary")
                    next_btn = gr.Button("Next", variant="primary")

                # Download Links
                with gr.Row():
                    download_audio = gr.File(label="Download Last Audio", interactive=False)
                    download_transcript = gr.File(label="Download Last Transcript", interactive=False)
                    download_all = gr.File(label="Download All Recordings", interactive=False)

        def download_all_recordings(speaker_id_value):
            """Handle downloading all recordings for a speaker"""
            if not speaker_id_value:
                return {
                    status: "⚠️ Please enter a Speaker ID first",
                    download_all: None
                }

            zip_path = collector.create_zip_archive(speaker_id_value)
            if zip_path:
                return {
                    status: "✅ Archive created successfully",
                    download_all: zip_path
                }
            return {
                status: "❌ Failed to create archive",
                download_all: None
            }

        # Add download all button and its event handler
        download_all_btn = gr.Button("Download All Recordings", variant="secondary")
        download_all_btn.click(
            download_all_recordings,
            inputs=[speaker_id],
            outputs=[status, download_all]
        )

        # Add recordings display
        with gr.Column(scale=2):
            recordings_display = gr.HTML(
                label="Saved Recordings",
                value="<div id='recordings-list'></div>"
            )

        def process_pasted_text(text):
            """Handle pasted text input"""
            if not text:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: "⚠️ No text provided",
                    dataset_info: collector.get_dataset_statistics()
                }

            success, msg = collector.process_text(text)
            if not success:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: f"❌ {msg}",
                    dataset_info: collector.get_dataset_statistics()
                }

            nav_info = collector.get_navigation_info()
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
            return {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: f"✅ {msg}",
                dataset_info: collector.get_dataset_statistics()
            }

        def update_font(font_style):
            """Update font and refresh display"""
            success, msg = collector.set_font(font_style)
            if not success:
                # Partial update: only the status line changes on failure.
                return {status: msg}

            nav_info = collector.get_navigation_info()
            return {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                status: f"Font updated to {font_style}"
            }

        def load_file(file):
            """Handle file loading with enhanced error reporting"""
            if not file:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: "⚠️ No file selected",
                    dataset_info: collector.get_dataset_statistics()
                }

            success, msg = collector.load_text_file(file)
            if not success:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: f"❌ {msg}",
                    dataset_info: collector.get_dataset_statistics()
                }

            nav_info = collector.get_navigation_info()
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
            return {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: f"✅ {msg}",
                dataset_info: collector.get_dataset_statistics()
            }

        def save_current_recording(audio_file, speaker_id_value, dataset_name_value):
            """Handle saving the current recording"""
            if not audio_file:
                return {
                    status: "⚠️ Please record audio first",
                    download_audio: None,
                    download_transcript: None,
                    download_all: None,
                    recordings_display: "<div id='recordings-list'>No recordings yet</div>",
                    audio_recorder: None  # Clear the recorder
                }

            success, msg, recordings = collector.save_recording(
                audio_file, speaker_id_value, dataset_name_value
            )

            if not success:
                return {
                    status: f"❌ {msg}",
                    dataset_info: collector.get_dataset_statistics(),
                    download_audio: None,
                    download_transcript: None,
                    download_all: None,
                    recordings_display: "<div id='recordings-list'>No recordings yet</div>"
                }

            # Get paths to the saved files
            audio_path = collector.get_last_audio_path(speaker_id_value)
            transcript_path = collector.get_last_transcript_path(speaker_id_value)
            zip_path = collector.create_zip_archive(speaker_id_value)

            # Auto-advance to next sentence after successful save
            nav_info = collector.navigate("next")
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"

            # Update recordings display
            recordings_html = create_recordings_display(recordings)

            result = {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: f"✅ {msg}",
                dataset_info: collector.get_dataset_statistics(),
                download_audio: audio_path,
                download_transcript: transcript_path,
                download_all: zip_path,
                recordings_display: recordings_html,
                audio_recorder: None  # Clear the recorder after successful save
            }
            return result

        def create_recordings_display(recordings):
            """Create HTML display for recordings"""
            recordings_html = "<div id='recordings-list'><h3>Saved Recordings:</h3>"
            for idx, rec in recordings.items():
                recordings_html += f"""
                <div style='margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
                    <p><strong>Sentence {idx + 1}:</strong> {rec['sentence']}</p>
                    <audio controls src='{rec['audio_file']}'></audio>
                </div>
                """
            recordings_html += "</div>"
            return recordings_html

        def navigate_sentences(direction):
            """Handle navigation between sentences"""
            nav_info = collector.navigate(direction)
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
            return {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: nav_info['status']
            }

        def add_custom_font(font_file_path):
            """Handle adding a custom font"""
            if not font_file_path:
                return {
                    font_select: gr.update(),
                    status: "⚠️ No font file selected"
                }
            success, msg = collector.add_custom_font(font_file_path)
            if not success:
                return {
                    font_select: gr.update(),
                    status: f"❌ {msg}"
                }
            # Update font dropdown
            font_choices = list(FONT_STYLES.keys()) + list(collector.custom_fonts.keys())
            # Return updates to font_select and status
            return {
                font_select: gr.update(choices=font_choices),
                status: f"✅ {msg}"
            }

        def clear_recording():
            """Clear the current recording"""
            return {
                audio_recorder: None,
                status: "Recording cleared"
            }

        # Add clear button handler
        clear_btn.click(
            clear_recording,
            outputs=[audio_recorder, status]
        )

        # Event handlers
        text_input.change(
            process_pasted_text,
            inputs=[text_input],
            outputs=[current_text, next_text, progress, status, dataset_info]
        )

        file_input.upload(
            load_file,
            inputs=[file_input],
            outputs=[current_text, next_text, progress, status, dataset_info]
        )

        font_select.change(
            update_font,
            inputs=[font_select],
            outputs=[current_text, next_text, status]
        )

        add_font_btn.click(
            add_custom_font,
            inputs=[font_file_input],
            outputs=[font_select, status]
        )

        save_btn.click(
            save_current_recording,
            inputs=[audio_recorder, speaker_id, dataset_name],
            outputs=[current_text, next_text, progress, status, dataset_info,
                     download_audio, download_transcript, download_all, recordings_display,
                     audio_recorder]  # Add audio_recorder to outputs
        )

        # Lambdas bind the fixed direction for each navigation button.
        prev_btn.click(
            lambda: navigate_sentences("prev"),
            outputs=[current_text, next_text, progress, status]
        )

        next_btn.click(
            lambda: navigate_sentences("next"),
            outputs=[current_text, next_text, progress, status]
        )

        # Initialize dataset info
        dataset_info.value = collector.get_dataset_statistics()

    return interface
932
+
933
+ if __name__ == "__main__":
934
+ try:
935
+ # Set up any required environment variables
936
+ os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
937
+ os.environ["GRADIO_SERVER_PORT"] = "7860"
938
 
939
+ # Create and launch the interface
940
+ interface = create_interface()
941
+ interface.queue() # Enable queuing for better handling of concurrent users
942
+ interface.launch(
943
+ server_name="0.0.0.0",
944
+ server_port=7860,
945
+ share=True,
946
+ debug=True,
947
+ show_error=True
948
+ )
949
+ except Exception as e:
950
+ logger.error(f"Failed to launch interface: {str(e)}")
951
+ logger.error(traceback.format_exc())
952
+ raise