Add files using upload-large-folder tool

9fd672f verified 11 months ago

8.41 kB

	import re
	from typing import List, Set

	# Digit normalisation: Devanagari digits and the ASCII digits that replace
	# them, kept index-aligned so position i of one maps to position i of the other.
	_hindi_numbers = "०१२३४५६७८९"
	_english_numbers = "0123456789"
	# Table for str.translate(): each Devanagari digit -> its ASCII counterpart.
	_number_map = str.maketrans(dict(zip(_hindi_numbers, _english_numbers)))

	# Abbreviation -> expansion table.  Keys are the bare abbreviations (no
	# trailing period); insertion order is the order in which they are declared.
	_abbreviations = {
	    # --- Titles and honorifics ---
	    "डॉ": "डॉक्टर",
	    "श्री": "श्रीमान",
	    "श्रीमती": "श्रीमती",  # maps to itself
	    "कु": "कुमारी",
	    "प्रो": "प्रोफेसर",
	    "चौ": "चौधरी",
	    "स्व": "स्वर्गीय",
	    # --- Organisation-related ---
	    "भा": "भारत",
	    "सं": "संघ",
	    "वि": "विश्वविद्यालय",
	    "म": "महा",
	    # --- Miscellaneous common words ---
	    "क्र": "क्रमांक",
	    "रु": "रुपये",
	    "ज़ि": "ज़िला",
	    "उ": "उत्तर",
	    "द": "दक्षिण",
	    "पू": "पूर्व",
	    "प": "पश्चिम",
	}

	# Known consonant clusters, each spelled as consonant + virama + consonant.
	# The table is written as whitespace-separated groups (one group per leading
	# consonant) and split into a set for O(1) membership tests.
	_common_conjuncts = set(
	    (
	        # leading क
	        "क्क क्त क्र क्ल क्व क्ष क्स "
	        # leading ग
	        "ग्र ग्ल ग्व ग्न ग्म "
	        # leading च
	        "च्च च्छ च्य च्र "
	        # leading ज
	        "ज्ज ज्ञ ज्य ज्र ज्व "
	        # leading त
	        "त्त त्र त्य त्व त्न त्म "
	        # leading द
	        "द्द द्य द्व द्र द्म द्ध "
	        # leading न
	        "न्न न्त न्द न्य न्र न्व "
	        # leading प
	        "प्प प्त प्र प्ल प्स "
	        # leading ब
	        "ब्र ब्ल ब्ज "
	        # leading म
	        "म्प म्ब म्म म्ल म्र "
	        # leading य
	        "य्य य्र "
	        # leading र (reph forms)
	        "र्क र्ग र्च र्ज र्त र्द र्प र्ब र्म र्य र्ल र्व र्श र्स र्ह "
	        # leading ल
	        "ल्क ल्ग ल्ट ल्ड ल्प ल्म ल्ल ल्व "
	        # leading श
	        "श्च श्न श्प श्म श्य श्र श्ल श्व "
	        # leading स
	        "स्क स्ट स्त स्थ स्न स्प स्फ स्म स्य स्र स्व स्स "
	        # leading ह
	        "ह्य ह्र ह्व ह्ल ह्न ह्म"
	    ).split()
	)

	def _is_final_position(text: str, pos: int) -> bool:
	    """Report whether ``text[pos]`` is the last character of a word.

	    The position counts as final when it is the last index of ``text`` or
	    when the character right after it is a space or one of ``, . ! ? -``.
	    """
	    if pos == len(text) - 1:
	        return True
	    follower = text[pos + 1]
	    return follower in (' ', ',', '.', '!', '?', '-')

	def _is_light_syllable(text: str, pos: int) -> bool:
	    """Report whether the character at ``pos`` starts a "light" syllable.

	    The test only looks at the next character: the syllable is light unless
	    that character is a dependent vowel sign (U+093E..U+094C) or the virama
	    (U+094D).  A position at or beyond the last index is always light.
	    """
	    last_index = len(text) - 1
	    if pos >= last_index:
	        return True
	    follower = text[pos + 1]
	    # The vowel signs and the virama occupy one contiguous code-point range.
	    return not ('\u093E' <= follower <= '\u094D')

	def _word_schwa_deletions(word: str) -> Set[int]:
	    """Return the indices inside ``word`` of consonants with a silent inherent vowel.

	    The word is scanned right to left so that a decision already taken for a
	    later consonant is visible when the consonant before it is examined.
	    """
	    virama = '\u094D'
	    length = len(word)
	    silent: Set[int] = set()

	    def is_consonant(ch: str) -> bool:
	        return '\u0915' <= ch <= '\u0939'

	    def is_vowel(ch: str) -> bool:
	        # Independent vowels (U+0904..U+0914) or dependent vowel signs
	        # (U+093E..U+094C).  The empty string compares False.
	        return '\u0904' <= ch <= '\u0914' or '\u093E' <= ch <= '\u094C'

	    def carries_vowel(k: int) -> bool:
	        # True when the consonant at k is followed by a pronounced vowel:
	        # an explicit vowel, or an inherent vowel that has not been deleted.
	        follower = word[k + 1] if k + 1 < length else ''
	        if follower == virama:
	            return False
	        if is_vowel(follower):
	            return True
	        return k not in silent

	    for i in range(length - 1, -1, -1):
	        if not is_consonant(word[i]):
	            continue
	        follower = word[i + 1] if i + 1 < length else ''

	        # Explicit halant: the virama itself suppresses the inherent vowel.
	        if follower == virama:
	            silent.add(i)
	            continue

	        # Word-final consonant: drop the schwa, unless it is the only vowel
	        # the word has (e.g. a single-consonant word).
	        if not follower:
	            has_earlier_vowel = any(
	                is_vowel(word[k])
	                or (is_consonant(word[k]) and word[k + 1] != virama)
	                for k in range(i)
	            )
	            if has_earlier_vowel:
	                silent.add(i)
	            continue

	        # Followed by a vowel sign, anusvara, etc.: nothing to delete.
	        if not is_consonant(follower):
	            continue

	        # Medial consonant: delete only in the context  V C _ C V, i.e. a
	        # vowel directly before and a vowel-bearing consonant directly after.
	        # A consonant directly before necessarily still carries its schwa
	        # here, because it has not been visited yet.
	        if i == 0:
	            continue
	        before = word[i - 1]
	        if (is_vowel(before) or is_consonant(before)) and carries_vowel(i + 1):
	            silent.add(i)

	    return silent


	def get_schwa_deletion_positions(text: str) -> Set[int]:
	    """Return the indices in ``text`` of consonants whose inherent 'a' is silent.

	    ``text`` is split into words at whitespace, the punctuation marks
	    ``. , ! ? -`` and the danda / double danda.  Within each word a
	    consonant (U+0915..U+0939) is reported when:

	    * it is followed by a virama (explicit halant);
	    * it is the last character of the word and the word has another vowel;
	    * it sits between a vowel and a consonant that carries a vowel
	      (evaluated right to left, so two neighbouring schwas are never both
	      dropped).

	    Offsets come from the regex match, so repeated words get the correct
	    positions.

	    Args:
	        text: Devanagari text; any other characters are ignored.

	    Returns:
	        Set of 0-based indices into ``text``.
	    """
	    positions: Set[int] = set()
	    for token in re.finditer(r'[^\s.,!?\-\u0964\u0965]+', text):
	        offset = token.start()
	        positions.update(
	            offset + idx for idx in _word_schwa_deletions(token.group())
	        )
	    return positions

	def normalize_hindi_text(text: str) -> str:
	    """Normalise Hindi text before phoneme conversion.

	    Steps, in order:

	    1. Bring the text to Unicode NFC.  This gives nukta letters a single
	       spelling (base letter + U+093C), so the later steps work no matter
	       which encoding the input used.
	    2. Translate Devanagari digits to ASCII digits.
	    3. Expand the abbreviations in ``_abbreviations``.  A match must not be
	       adjacent to another Devanagari letter or sign; the regex ``\\b`` is
	       not used because it treats vowel signs and the virama as non-word
	       characters and would therefore match inside ordinary words.  All
	       abbreviations are expanded in one pass, longest first, so an
	       expansion is never expanded again.
	    4. Replace chandrabindu (U+0901) with anusvara (U+0902).
	    5. Drop the nukta from क ख ग ज ड ढ फ.
	    6. Remove everything except Devanagari, ASCII digits, whitespace and
	       the punctuation ``. , ! ? -``.
	    7. Collapse whitespace runs to one space and strip the ends.

	    Args:
	        text: Raw input text.

	    Returns:
	        The normalised text.
	    """
	    import unicodedata  # only needed here, so imported locally

	    text = unicodedata.normalize('NFC', text)

	    # Devanagari digits -> ASCII digits.
	    text = text.translate(_number_map)

	    # Abbreviation expansion.  The look-around class covers Devanagari
	    # letters and signs but not the danda, digits or the abbreviation sign
	    # (U+0964..U+0970), which may legitimately follow an abbreviation.
	    expansions = {
	        unicodedata.normalize('NFC', abbr): full
	        for abbr, full in _abbreviations.items()
	    }
	    if expansions:
	        alternation = '|'.join(
	            re.escape(abbr)
	            for abbr in sorted(expansions, key=len, reverse=True)
	        )
	        pattern = re.compile(
	            rf'(?<![\u0900-\u0963\u0971-\u097F])'
	            rf'(?:{alternation})'
	            rf'(?![\u0900-\u0963\u0971-\u097F])'
	        )
	        text = pattern.sub(lambda match: expansions[match.group(0)], text)

	    # Chandrabindu -> anusvara.
	    text = text.replace('\u0901', '\u0902')

	    # Strip the nukta (U+093C) after क ख ग ज ड ढ फ.
	    text = re.sub(
	        '([\u0915\u0916\u0917\u091C\u0921\u0922\u092B])\u093C', r'\1', text
	    )

	    # Keep Devanagari, ASCII digits, whitespace and basic punctuation.  The
	    # digits are kept on purpose: step 2 has just converted them to ASCII.
	    text = re.sub(r'[^\u0900-\u097F0-9\s.,!?-]', '', text)

	    # Collapse whitespace last so that removed characters cannot leave
	    # double spaces behind.
	    text = re.sub(r'\s+', ' ', text)

	    return text.strip()

	def hindi_to_phonemes(text: str) -> str:
	    """Convert Hindi text to a space-separated string of phoneme tokens.

	    The text is first passed through ``normalize_hindi_text``.  Tokens are
	    then produced as follows:

	    * a space becomes ``'SP'``; ``. , ! ? -`` are emitted unchanged;
	    * a consonant + virama + consonant sequence listed in
	      ``_common_conjuncts`` is emitted as one token;
	    * any other consonant is emitted on its own;
	    * a dependent vowel sign after the consonant (or conjunct) is emitted
	      as its own token;
	    * a virama after the consonant is consumed and no inherent vowel is
	      emitted;
	    * otherwise the inherent vowel ``'अ'`` is emitted, unless the
	      consonant's index is in ``get_schwa_deletion_positions(text)``;
	    * independent vowels, anusvara and visarga are emitted unchanged;
	    * every other character is skipped.

	    Args:
	        text: Raw Hindi text.

	    Returns:
	        The tokens joined by single spaces.
	    """
	    text = normalize_hindi_text(text)
	    # The set depends only on the normalised text, so compute it once
	    # rather than once per consonant.
	    silent = get_schwa_deletion_positions(text)
	    virama = '\u094D'
	    size = len(text)
	    phonemes = []

	    i = 0
	    while i < size:
	        char = text[i]

	        # Spaces and punctuation.
	        if char in ' .,!?-':
	            phonemes.append('SP' if char == ' ' else char)
	            i += 1

	        # Consonants.
	        elif '\u0915' <= char <= '\u0939':
	            cluster = text[i:i + 3]
	            if cluster in _common_conjuncts:
	                # Known conjunct: emit it whole and move to its last
	                # consonant, which is the one that carries the vowel.
	                phonemes.append(cluster)
	                i += 2
	            else:
	                phonemes.append(char)

	            follower = text[i + 1] if i + 1 < size else ''
	            if follower and '\u093E' <= follower <= '\u094C':
	                # Explicit vowel sign.
	                phonemes.append(follower)
	                i += 2
	            elif follower == virama:
	                # Explicit halant: no inherent vowel; skip the virama.
	                i += 2
	            else:
	                # Inherent vowel, unless it is deleted at this position.
	                if i not in silent:
	                    phonemes.append('अ')
	                i += 1

	        # Independent vowels.
	        elif '\u0904' <= char <= '\u0914':
	            phonemes.append(char)
	            i += 1

	        # Anusvara and visarga.
	        elif char in ('\u0902', '\u0903'):
	            phonemes.append(char)
	            i += 1

	        # Anything else is ignored.
	        else:
	            i += 1

	    return ' '.join(phonemes)

	def get_phoneme_sequence(text: str) -> List[str]:
	    """Return the phoneme tokens of ``text`` as a list.

	    The text is converted with ``hindi_to_phonemes`` and the resulting
	    string is split on whitespace.
	    """
	    return hindi_to_phonemes(text).split()