sound / GPT_SoVITS /text /hindi.py
Alyosha11's picture
Add files using upload-large-folder tool
9fd672f verified
import re
from typing import List, Set
# Hindi text normalization rules
_hindi_numbers = "०१२३४५६७८९"
_english_numbers = "0123456789"
_number_map = str.maketrans(_hindi_numbers, _english_numbers)
# Abbreviation table consumed by normalize_hindi_text(): each key is an
# abbreviated spelling and each value is the spelled-out form that replaces it.
# Several keys are a single consonant, so the matching in
# normalize_hindi_text() has to be strict about word boundaries.
_abbreviations = {
    # Titles and honorifics
    "डॉ": "डॉक्टर",
    "श्री": "श्रीमान",
    "श्रीमती": "श्रीमती",  # maps to itself, so replacing it is a no-op
    "कु": "कुमारी",
    "प्रो": "प्रोफेसर",
    "चौ": "चौधरी",
    "स्व": "स्वर्गीय",
    # Common organizations
    "भा": "भारत",
    "सं": "संघ",
    "वि": "विश्वविद्यालय",
    "म": "महा",
    # Common words (currency, serial number, district, compass directions)
    "क्र": "क्रमांक",
    "रु": "रुपये",
    "ज़ि": "ज़िला",
    "उ": "उत्तर",
    "द": "दक्षिण",
    "पू": "पूर्व",
    "प": "पश्चिम"
}
# Common conjunct consonants (consonant clusters).
# Each entry is written as consonant + virama (U+094D) + consonant and is
# grouped below by its first consonant. hindi_to_phonemes() tests candidate
# clusters for membership in this set.
_common_conjuncts = {
    # क-based conjuncts
    "क्क", "क्त", "क्र", "क्ल", "क्व", "क्ष", "क्स",
    # ग-based conjuncts
    "ग्र", "ग्ल", "ग्व", "ग्न", "ग्म",
    # च-based conjuncts
    "च्च", "च्छ", "च्य", "च्र",
    # ज-based conjuncts
    "ज्ज", "ज्ञ", "ज्य", "ज्र", "ज्व",
    # त-based conjuncts
    "त्त", "त्र", "त्य", "त्व", "त्न", "त्म",
    # द-based conjuncts
    "द्द", "द्य", "द्व", "द्र", "द्म", "द्ध",
    # न-based conjuncts
    "न्न", "न्त", "न्द", "न्य", "न्र", "न्व",
    # प-based conjuncts
    "प्प", "प्त", "प्र", "प्ल", "प्स",
    # ब-based conjuncts
    "ब्र", "ब्ल", "ब्ज",
    # म-based conjuncts
    "म्प", "म्ब", "म्म", "म्ल", "म्र",
    # य-based conjuncts
    "य्य", "य्र",
    # र-based conjuncts (reph forms)
    "र्क", "र्ग", "र्च", "र्ज", "र्त", "र्द", "र्प", "र्ब", "र्म", "र्य", "र्ल", "र्व", "र्श", "र्स", "र्ह",
    # ल-based conjuncts
    "ल्क", "ल्ग", "ल्ट", "ल्ड", "ल्प", "ल्म", "ल्ल", "ल्व",
    # श-based conjuncts
    "श्च", "श्न", "श्प", "श्म", "श्य", "श्र", "श्ल", "श्व",
    # स-based conjuncts
    "स्क", "स्ट", "स्त", "स्थ", "स्न", "स्प", "स्फ", "स्म", "स्य", "स्र", "स्व", "स्स",
    # ह-based conjuncts
    "ह्य", "ह्र", "ह्व", "ह्ल", "ह्न", "ह्म"
}
def _is_final_position(text: str, pos: int) -> bool:
"""Check if the position is at the end of a word."""
return pos == len(text) - 1 or text[pos + 1] in {' ', ',', '.', '!', '?', '-'}
def _is_light_syllable(text: str, pos: int) -> bool:
"""Check if the syllable at given position is light (no long vowels or conjuncts)."""
if pos >= len(text) - 1:
return True
next_char = text[pos + 1]
return not (('\u093E' <= next_char <= '\u094C') or next_char == '\u094D')
def _schwa_deletions_in_word(word: str) -> Set[int]:
    """Return the word-relative indices of consonants that lose their schwa.

    ``word`` is one whitespace-delimited token; it may still contain the
    punctuation marks ``, . ! ? -``, which are treated as word edges.
    """
    breaks = ',.!?-'
    virama = '\u094D'
    last = len(word) - 1

    def is_consonant(ch: str) -> bool:
        return '\u0915' <= ch <= '\u0939'

    def starts_word(idx: int) -> bool:
        return idx == 0 or word[idx - 1] in breaks

    def ends_word(idx: int) -> bool:
        return idx == last or word[idx + 1] in breaks

    deleted: Set[int] = set()
    for i, ch in enumerate(word):
        if not is_consonant(ch):
            continue

        if ends_word(i):
            # Rule 1: a word-final bare consonant drops its schwa. A consonant
            # that is the whole word (e.g. the particle "na") keeps it.
            if not starts_word(i):
                deleted.add(i)
            continue

        follower = word[i + 1]
        if follower == virama:
            # Rule 3: an explicit virama suppresses the inherent vowel.
            deleted.add(i)
        elif (is_consonant(follower)
              and not starts_word(i)
              and not ends_word(i + 1)):
            # Rule 2: a medial bare consonant followed by another consonant
            # drops its schwa. The first consonant of a word always keeps it,
            # and so does the consonant before a word-final one (which is
            # itself deleted by Rule 1), so no vowel-less pair ends the word.
            deleted.add(i)
    return deleted


def get_schwa_deletion_positions(text: str) -> Set[int]:
    """Find the consonants in ``text`` whose inherent vowel is not pronounced.

    The text is split on whitespace and each word is examined on its own.
    A consonant (U+0915..U+0939) is reported when:

    1. it ends the word (end of token, or followed by ``, . ! ? -``) and is
       not also the first letter of the word;
    2. it is not word-initial and is followed by a consonant that does not
       itself end the word; or
    3. it is followed by a virama (U+094D).

    Consonants followed by a vowel sign or any other mark are never
    reported. This is a heuristic, not a full schwa-deletion model.

    Args:
        text: Devanagari text; indices in the result refer to this string.

    Returns:
        The set of indices into ``text`` where no schwa should be emitted.
    """
    positions: Set[int] = set()
    cursor = 0
    for word in text.split():
        # Search from the end of the previous word so that a repeated word
        # (or one that is a substring of an earlier word) gets its own offset.
        word_start = text.find(word, cursor)
        cursor = word_start + len(word)
        positions.update(word_start + i for i in _schwa_deletions_in_word(word))
    return positions
def normalize_hindi_text(text: str) -> str:
    """Normalize Hindi text before phonemization.

    Steps, in order:

    1. Devanagari digits are mapped to ASCII digits via ``_number_map``.
    2. Stand-alone abbreviations from ``_abbreviations`` are expanded.
    3. Chandrabindu (U+0901) is replaced by anusvara (U+0902).
    4. Nukta letters are reduced to their base consonant, whether they are
       encoded as one code point (U+0958..U+095F) or as base + U+093C.
    5. Every character outside the Devanagari block, whitespace and
       ``. , ! ? -`` is removed.
    6. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: Raw input text.

    Returns:
        The normalized text.
    """
    # NOTE(review): step 5 removes ASCII digits, so numbers converted here do
    # not survive normalization. Spelling numbers out would need a separate
    # verbalization step; the existing behavior is kept.
    text = text.translate(_number_map)

    # `\b` cannot be used for Devanagari: vowel signs and the virama are
    # combining marks, which `\w` does not match, so `\b` would fall between
    # a consonant and its own vowel sign. Instead require that no letter of
    # any script and no Devanagari letter or mark touches the abbreviation.
    # The danda, the digits and the abbreviation sign (U+0964..U+0970) are
    # deliberately left out of the class so they still count as an edge.
    word_char = r'[\w\u0900-\u0963\u0971-\u097F]'
    for abbr, full in _abbreviations.items():
        pattern = rf'(?<!{word_char}){re.escape(abbr)}(?!{word_char})'
        text = re.sub(pattern, full, text)

    # Normalize chandrabindu to anusvara
    text = text.replace('\u0901', '\u0902')

    # Normalize nukta letters in both of their encodings: precomposed letters
    # map to the base consonant, and a combining nukta is dropped.
    nukta_table = {
        0x0958: '\u0915',  # QA   -> KA
        0x0959: '\u0916',  # KHHA -> KHA
        0x095A: '\u0917',  # GHHA -> GA
        0x095B: '\u091C',  # ZA   -> JA
        0x095C: '\u0921',  # DDDHA -> DDA
        0x095D: '\u0922',  # RHA  -> DDHA
        0x095E: '\u092B',  # FA   -> PHA
        0x095F: '\u092F',  # YYA  -> YA
        0x093C: None,      # combining nukta sign
    }
    text = text.translate(nukta_table)

    # Remove any non-Devanagari characters except basic punctuation
    text = re.sub(r'[^\u0900-\u097F\s.,!?-]', '', text)

    # Collapse whitespace last, so that gaps left by removed characters do
    # not turn into runs of spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def hindi_to_phonemes(text: str) -> str:
    """Convert Hindi text to a space-separated string of phoneme tokens.

    The text is first passed through ``normalize_hindi_text``. Tokens are
    then produced as follows:

    * a space becomes ``SP``; ``. , ! ? -`` are emitted unchanged;
    * consonant + virama + consonant found in ``_common_conjuncts`` is
      emitted as one three-character token; any other consonant is emitted
      as a single letter;
    * a vowel sign (U+093E..U+094C) following the consonant or cluster is
      emitted as its own token;
    * a consonant followed by a virama gets no vowel;
    * otherwise the inherent vowel ``अ`` is emitted, unless
      ``get_schwa_deletion_positions`` reports the consonant's index;
    * independent vowels (U+0904..U+0914), anusvara and visarga are emitted
      unchanged; every other character is skipped.

    Only two-consonant clusters are recognized; in a longer cluster the
    remaining consonants are emitted individually.

    NOTE(review): cluster tokens need matching entries in whatever symbol
    inventory consumes this output -- confirm against the caller.

    Args:
        text: Raw Hindi text.

    Returns:
        The phoneme tokens joined by single spaces.
    """
    text = normalize_hindi_text(text)
    # Computed once up front: the set depends only on the normalized text.
    schwa_deleted = get_schwa_deletion_positions(text)
    virama = '\u094D'
    length = len(text)
    phonemes: List[str] = []
    i = 0
    while i < length:
        char = text[i]

        # Spaces and punctuation
        if char in ' .,!?-':
            phonemes.append('SP' if char == ' ' else char)
            i += 1

        # Consonants
        elif '\u0915' <= char <= '\u0939':
            cluster = text[i:i + 3]
            if cluster in _common_conjuncts:
                phonemes.append(cluster)
                # Move to the cluster's last consonant, which carries the vowel.
                i += 2
            else:
                phonemes.append(char)

            follower = text[i + 1] if i + 1 < length else ''
            if '\u093E' <= follower <= '\u094C':
                # Explicit vowel sign
                phonemes.append(follower)
                i += 2
            elif follower == virama:
                # Explicit virama: no vowel; the next consonant is handled
                # on the following iteration.
                i += 2
            else:
                # Inherent vowel, unless it is deleted at this position
                if i not in schwa_deleted:
                    phonemes.append('अ')
                i += 1

        # Independent vowels
        elif '\u0904' <= char <= '\u0914':
            phonemes.append(char)
            i += 1

        # Anusvara and visarga
        elif char in ('\u0902', '\u0903'):
            phonemes.append(char)
            i += 1

        # Anything else (stray marks, danda, ...) produces no token
        else:
            i += 1
    return ' '.join(phonemes)
def get_phoneme_sequence(text: str) -> List[str]:
    """Return the phoneme tokens for ``text`` as a list.

    The text is run through ``hindi_to_phonemes`` and the resulting string
    is split on whitespace.
    """
    return hindi_to_phonemes(text).split()