|
|
import re |
|
|
from typing import List, Set |
|
|
|
|
|
|
|
|
_hindi_numbers = "०१२३४५६७८९" |
|
|
_english_numbers = "0123456789" |
|
|
_number_map = str.maketrans(_hindi_numbers, _english_numbers) |
|
|
|
|
|
|
|
|
_abbreviations = { |
|
|
|
|
|
"डॉ": "डॉक्टर", |
|
|
"श्री": "श्रीमान", |
|
|
"श्रीमती": "श्रीमती", |
|
|
"कु": "कुमारी", |
|
|
"प्रो": "प्रोफेसर", |
|
|
"चौ": "चौधरी", |
|
|
"स्व": "स्वर्गीय", |
|
|
|
|
|
|
|
|
"भा": "भारत", |
|
|
"सं": "संघ", |
|
|
"वि": "विश्वविद्यालय", |
|
|
"म": "महा", |
|
|
|
|
|
|
|
|
"क्र": "क्रमांक", |
|
|
"रु": "रुपये", |
|
|
"ज़ि": "ज़िला", |
|
|
"उ": "उत्तर", |
|
|
"द": "दक्षिण", |
|
|
"पू": "पूर्व", |
|
|
"प": "पश्चिम" |
|
|
} |
|
|
|
|
|
|
|
|
_common_conjuncts = { |
|
|
|
|
|
"क्क", "क्त", "क्र", "क्ल", "क्व", "क्ष", "क्स", |
|
|
|
|
|
"ग्र", "ग्ल", "ग्व", "ग्न", "ग्म", |
|
|
|
|
|
"च्च", "च्छ", "च्य", "च्र", |
|
|
|
|
|
"ज्ज", "ज्ञ", "ज्य", "ज्र", "ज्व", |
|
|
|
|
|
"त्त", "त्र", "त्य", "त्व", "त्न", "त्म", |
|
|
|
|
|
"द्द", "द्य", "द्व", "द्र", "द्म", "द्ध", |
|
|
|
|
|
"न्न", "न्त", "न्द", "न्य", "न्र", "न्व", |
|
|
|
|
|
"प्प", "प्त", "प्र", "प्ल", "प्स", |
|
|
|
|
|
"ब्र", "ब्ल", "ब्ज", |
|
|
|
|
|
"म्प", "म्ब", "म्म", "म्ल", "म्र", |
|
|
|
|
|
"य्य", "य्र", |
|
|
|
|
|
"र्क", "र्ग", "र्च", "र्ज", "र्त", "र्द", "र्प", "र्ब", "र्म", "र्य", "र्ल", "र्व", "र्श", "र्स", "र्ह", |
|
|
|
|
|
"ल्क", "ल्ग", "ल्ट", "ल्ड", "ल्प", "ल्म", "ल्ल", "ल्व", |
|
|
|
|
|
"श्च", "श्न", "श्प", "श्म", "श्य", "श्र", "श्ल", "श्व", |
|
|
|
|
|
"स्क", "स्ट", "स्त", "स्थ", "स्न", "स्प", "स्फ", "स्म", "स्य", "स्र", "स्व", "स्स", |
|
|
|
|
|
"ह्य", "ह्र", "ह्व", "ह्ल", "ह्न", "ह्म" |
|
|
} |
|
|
|
|
|
def _is_final_position(text: str, pos: int) -> bool: |
|
|
"""Check if the position is at the end of a word.""" |
|
|
return pos == len(text) - 1 or text[pos + 1] in {' ', ',', '.', '!', '?', '-'} |
|
|
|
|
|
def _is_light_syllable(text: str, pos: int) -> bool: |
|
|
"""Check if the syllable at given position is light (no long vowels or conjuncts).""" |
|
|
if pos >= len(text) - 1: |
|
|
return True |
|
|
next_char = text[pos + 1] |
|
|
return not (('\u093E' <= next_char <= '\u094C') or next_char == '\u094D') |
|
|
|
|
|
def _has_inherent_schwa(word: str, index: int) -> bool:
    """Return True if ``word[index]`` is a consonant carrying an inherent schwa.

    A consonant (U+0915-U+0939) carries the inherent schwa unless it is
    directly followed by a vowel sign (U+093E-U+094C) or a virama (U+094D).
    """
    if not '\u0915' <= word[index] <= '\u0939':
        return False
    if index == len(word) - 1:
        return True
    return not '\u093E' <= word[index + 1] <= '\u094D'


def _is_vowel_nucleus(word: str, index: int) -> bool:
    """Return True if ``word[index]`` contributes a vowel to the written form.

    That is: an independent vowel (U+0904-U+0914), a vowel sign
    (U+093E-U+094C), or a consonant with its inherent schwa.
    """
    char = word[index]
    return ('\u0904' <= char <= '\u0914'
            or '\u093E' <= char <= '\u094C'
            or _has_inherent_schwa(word, index))


def _schwa_deletions_in_word(word: str) -> Set[int]:
    """Return the indices inside one word whose inherent schwa is silent."""
    silent: Set[int] = set()
    last = len(word) - 1
    if last < 1:
        # Empty or single-character word: nothing can be dropped.
        return silent

    # Word-final schwa is silent, unless it is the only vowel of the word
    # (dropping it would leave the word without any vowel at all).
    if _has_inherent_schwa(word, last) and any(
            _is_vowel_nucleus(word, k) for k in range(last)):
        silent.add(last)

    # Medial schwa: silent in the context VC_CV. Scanning right to left lets
    # each decision see whether the consonant on its right kept its vowel,
    # so two adjacent schwas are never both removed.
    for i in range(last - 1, 0, -1):
        if not _has_inherent_schwa(word, i):
            continue
        if not _is_vowel_nucleus(word, i - 1):
            continue
        if not '\u0915' <= word[i + 1] <= '\u0939':
            continue
        if i + 2 <= last and '\u093E' <= word[i + 2] <= '\u094C':
            vowel_follows = True
        else:
            # False when the next consonant is followed by a virama (it then
            # starts a cluster) or has already lost its own schwa.
            vowel_follows = (_has_inherent_schwa(word, i + 1)
                             and (i + 1) not in silent)
        if vowel_follows:
            silent.add(i)

    return silent


def get_schwa_deletion_positions(text: str) -> Set[int]:
    """Determine positions where the inherent schwa should be deleted.

    ``text`` is split into words on whitespace and on the punctuation marks
    ``, . ! ? -`` plus the danda signs (U+0964, U+0965). Within each word two
    rules are applied:

    * Word-final: a consonant at the very end of a word loses its schwa,
      unless that schwa is the only vowel in the word.
    * Medial (VC_CV): scanning right to left, a consonant loses its schwa
      when it is immediately preceded by a vowel and immediately followed by
      a single consonant that still carries a vowel.

    NOTE(review): these are purely orthographic heuristics; they do not look
    at morphology, so some inflected forms will be mis-predicted.

    Args:
        text: The text to analyse.

    Returns:
        The set of indices into ``text`` of consonants whose inherent schwa
        is silent.
    """
    positions: Set[int] = set()

    # finditer supplies each word's true offset. Searching for the word text
    # instead would always hit the first occurrence, which is wrong for
    # repeated words.
    for match in re.finditer(r'[^\s,.!?\u0964\u0965-]+', text):
        word_start = match.start()
        for index in _schwa_deletions_in_word(match.group()):
            positions.add(word_start + index)

    return positions
|
|
|
|
|
def normalize_hindi_text(text: str) -> str:
    """Normalize Hindi text by applying a fixed sequence of rules.

    Steps, in order:

    1. Devanagari digits are translated to ASCII digits via ``_number_map``.
    2. Abbreviations listed in ``_abbreviations`` are expanded, but only when
       they stand alone (not when they are part of a longer Devanagari word).
    3. Chandrabindu (U+0901) is replaced by anusvara (U+0902).
    4. A fixed set of nukta letters is replaced by the plain base letter.
    5. Everything except Devanagari characters, ASCII digits, whitespace and
       the punctuation ``. , ! ? -`` is removed.
    6. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: Raw input text.

    Returns:
        The normalized text.
    """
    text = text.translate(_number_map)

    if _abbreviations:
        # The regex word-boundary assertion is unusable here: Devanagari
        # vowel signs and the virama are combining marks, which Python's
        # ``\w`` does not match, so a "boundary" appears in the middle of
        # ordinary words. Explicit lookarounds over Devanagari letters and
        # marks are used instead (dandas U+0964/U+0965 are deliberately not
        # part of the class, so an abbreviation before a danda still expands).
        # Longest keys first so a short key cannot pre-empt a longer one.
        alternatives = '|'.join(
            re.escape(abbr)
            for abbr in sorted(_abbreviations, key=len, reverse=True)
        )
        pattern = (
            rf'(?<![\u0900-\u0963\u0971-\u097F])'
            rf'(?:{alternatives})'
            rf'(?![\u0900-\u0963\u0971-\u097F])'
        )
        # Single pass: text produced by one expansion is never expanded again.
        text = re.sub(pattern, lambda found: _abbreviations[found.group()], text)

    # Chandrabindu -> anusvara.
    text = text.replace('\u0901', '\u0902')

    # Nukta letters -> plain base letters.
    nukta_chars = {
        'क़': 'क', 'ख़': 'ख', 'ग़': 'ग', 'ज़': 'ज',
        'ड़': 'ड', 'ढ़': 'ढ', 'फ़': 'फ'
    }
    for nuk, base in nukta_chars.items():
        text = text.replace(nuk, base)

    # ASCII digits are kept: step 1 converts Devanagari digits to ASCII, and
    # deleting them here would silently discard every number in the input.
    text = re.sub(r'[^\u0900-\u097F\s.,!?0-9-]', '', text)

    # Collapse whitespace last, so that characters removed above cannot leave
    # doubled spaces behind.
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
def hindi_to_phonemes(text: str) -> str:
    """Convert Hindi text to a space-separated phoneme string.

    The text is first passed through ``normalize_hindi_text``. The result is
    then scanned left to right and tokens are emitted as follows:

    * a space becomes ``SP``; each of ``. , ! ? -`` is emitted as itself;
    * a consonant (U+0915-U+0939) is emitted as itself, or, when it starts a
      three-character sequence listed in ``_common_conjuncts``, the whole
      conjunct is emitted as a single token;
    * after a consonant or conjunct: a following vowel sign (U+093E-U+094C)
      is emitted; a following virama (U+094D) means no vowel is emitted;
      otherwise the inherent schwa ``अ`` is emitted unless
      ``get_schwa_deletion_positions`` marks that consonant;
    * independent vowels (U+0904-U+0914), anusvara (U+0902) and visarga
      (U+0903) are emitted as themselves;
    * every other character is skipped.

    Args:
        text: Raw Hindi text.

    Returns:
        The emitted tokens joined by single spaces.
    """
    text = normalize_hindi_text(text)
    # Depends only on the normalized text, so compute it once rather than
    # once per consonant.
    silent_schwas = get_schwa_deletion_positions(text)
    schwa = '\u0905'  # अ
    phonemes: List[str] = []
    length = len(text)

    i = 0
    while i < length:
        char = text[i]

        if char == ' ':
            phonemes.append('SP')
            i += 1
        elif char in '.,!?-':
            phonemes.append(char)
            i += 1
        elif '\u0915' <= char <= '\u0939':
            # Conjunct entries are exactly consonant + virama + consonant, so
            # the lookup must use the full three-character slice.
            cluster = text[i:i + 3]
            if cluster in _common_conjuncts:
                phonemes.append(cluster)
                # Move to the conjunct's last consonant: the vowel that
                # follows (or the schwa decision) belongs to that consonant.
                i += 2
            else:
                phonemes.append(char)

            following = text[i + 1] if i + 1 < length else ''
            if following and '\u093E' <= following <= '\u094C':
                phonemes.append(following)
                i += 2
            elif following == '\u094D':
                # Explicit virama: the consonant has no vowel. Skip the
                # virama as well; the next consonant is handled on its own.
                i += 2
            else:
                if i not in silent_schwas:
                    phonemes.append(schwa)
                i += 1
        elif '\u0904' <= char <= '\u0914' or char in ('\u0902', '\u0903'):
            phonemes.append(char)
            i += 1
        else:
            i += 1

    return ' '.join(phonemes)
|
|
|
|
|
def get_phoneme_sequence(text: str) -> List[str]:
    """Return the phonemes of ``text`` as a list of tokens for the model.

    Runs ``hindi_to_phonemes`` on ``text`` and splits its output on
    whitespace.
    """
    return hindi_to_phonemes(text).split()