Spaces:

saeedzou
/

Persian_Phonemizer

Sleeping

App Files Files Community

Persian_Phonemizer / phonemizer.py

saeedzou

Update phonemizer.py

68d943b verified 7 days ago

raw

history blame contribute delete

6.63 kB

	import subprocess
	import re
	import string
	from fastapi import FastAPI, Request
	from pydantic import BaseModel
	from hazm import POSTagger, word_tokenize
	from parsnorm import ParsNorm

	# Application object and shared NLP resources, all created once at import time.
	app = FastAPI()

	# Text normalizer; constructed with remove_diacritics=False so diacritics are kept.
	normalizer = ParsNorm(remove_diacritics=False)

	# POS tagger; the model file must be present at this relative path.
	tagger = POSTagger(model="./pos_tagger.model")
	# Characters treated as punctuation: ASCII punctuation plus Persian marks.
	punctuation = string.punctuation + "؟:؛»«،"
	# Zero-width boundary between a word character and a punctuation character,
	# in either order. Replacing each match with a space detaches punctuation
	# from the adjacent word before tokenization.
	# The two cases must be joined with a bare `|` (alternation): an escaped
	# `\|` would match a literal pipe character and the boundaries would never
	# be found.
	pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"

	# Homographs whose pronunciation depends on the part of speech.
	# For each word, 'phonemes' and 'pos' are parallel lists: phonemes[i] is the
	# pronunciation used when the tagger assigns pos[i]. The 'diff' flag is
	# carried on every entry and is not read anywhere in this file.
	#
	# Fixed pairings (POS order was inverted relative to the phonemes):
	#   'شش'  : NUM -> ʃeʃ (six),   NOUN -> ʃoʃ (lung)
	#   'صفر' : NUM -> sefr (zero), NOUN -> safar (the month Safar)
	#   'پر'  : ADJ -> por (full),  NOUN -> par (feather)
	# NOTE(review): the remaining pairings were not re-verified here; 'معین' and
	# 'سقط' look like they may be inverted as well - confirm with a native speaker.
	ambiguity_dict = {
	    'بعد': {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
	    'شش': {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NUM', 'NOUN'], 'diff': True},
	    'سقط': {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	    'می': {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
	    'روی': {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
	    'رو': {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
	    'ولو': {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
	    'ده': {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
	    'خیر': {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	    'اولی': {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
	    'مایل': {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	    'سنی': {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	    'سبک': {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	    'کر': {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	    'نرم': {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
	    'جدا': {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
	    'معین': {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	    'خلقی': {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	    'بردار': {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
	    'مرد': {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
	    'مقدم': {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	    'پست': {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	    'شما': {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
	    'تنگ': {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	    'صفر': {'phonemes': ['safar', 'sefr'], 'pos': ['NOUN', 'NUM'], 'diff': True},
	    'پر': {'phonemes': ['por', 'par'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	    'مصر': {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	    'کشت': {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
	    'کی': {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
	    'جور': {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
	    'کرد': {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
	    'علی': {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
	    'شست': {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
	    'دهم': {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
	}
	def get_phoneme_for_pos(entry, target_pos):
	    """Look up the pronunciation that an ambiguity entry assigns to a POS tag.

	    ``entry['pos']`` and ``entry['phonemes']`` are read as parallel lists; the
	    phoneme at the position of the first tag equal to ``target_pos`` is
	    returned, or ``None`` when no tag matches.
	    """
	    try:
	        position = entry['pos'].index(target_pos)
	    except ValueError:
	        # The requested tag is not listed for this word.
	        return None
	    return entry['phonemes'][position]


	def get_phonemes(word):
	    """Get phonemes of a word using espeak-ng (Persian voice) without playing audio.

	    The stress marks ``ˈ``/``ˌ`` and the length mark ``ː`` are removed from the
	    output and ``q1`` is rewritten to ``q``; surrounding whitespace is stripped.

	    Args:
	        word: Text handed to espeak-ng as a single command-line argument.

	    Returns:
	        The cleaned IPA string; ``""`` if espeak-ng could not be started;
	        ``None`` if its output could not be decoded.
	    """
	    # Argument list and no shell: `word` originates from request text, so it
	    # must never be interpolated into a shell command line.
	    cmd = ["espeak-ng", "-v", "fa", "--ipa", "-q", word]
	    try:
	        # Decode explicitly as UTF-8 instead of relying on the process locale.
	        result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8")
	    except UnicodeDecodeError as e:
	        print(f"UnicodeDecodeError: {e}\n{word}")
	        return None
	    except OSError as e:
	        # espeak-ng missing or not executable: yield empty output (what the
	        # shell pipeline produced in that case) but say why.
	        print(f"espeak-ng could not be run: {e}")
	        return ""

	    # Same clean-up the sed pipeline performed, in the same order.
	    ipa = re.sub("[ˈˌː]", "", result.stdout)
	    ipa = ipa.replace("q1", "q")
	    return ipa.strip()


	def process_sentence(sentence, tagger, pattern, punctuation):
	    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

	    Args:
	        sentence: Text to convert.
	        tagger: Object whose ``tag(tokens)`` returns a sequence of
	            ``(token, tag)`` pairs. This function assumes one pair per input
	            token, punctuation included.
	        pattern: Regex whose matches are replaced by a single space before
	            tokenization.
	        punctuation: String of characters treated as punctuation.

	    Returns:
	        The phoneme chunks joined by single spaces. Punctuation tokens are
	        appended to the preceding chunk (or kept as-is at the start).
	    """
	    sentence = re.sub(pattern, ' ', sentence)
	    words = word_tokenize(sentence)
	    tagged_words = tagger.tag(words)

	    phoneme_list = []

	    # Walk tokens and their tags in lockstep. The tagger is given every token,
	    # so a separate counter that skips punctuation would drift out of step
	    # with the tag list after the first punctuation mark.
	    for word, tagged in zip(words, tagged_words):
	        tag = tagged[1]

	        # A token made only of punctuation characters (this also covers runs
	        # such as "...") is glued to the previous chunk instead of being
	        # phonemized.
	        if all(ch in punctuation for ch in word):
	            if phoneme_list:
	                phoneme_list[-1] += word
	            else:
	                phoneme_list.append(word)
	            continue

	        # Underscores and zero-width non-joiners become plain spaces.
	        word = word.replace('_', ' ').replace('\u200c', ' ')

	        # Prefer the hand-written pronunciation when the word is a known
	        # homograph and its POS (without the Ezafe marker) is listed.
	        entry = ambiguity_dict.get(word)
	        phonemes = get_phoneme_for_pos(entry, tag.replace(',EZ', '')) if entry else None
	        if phonemes is None:
	            # Only run espeak-ng when the dictionary did not decide; treat a
	            # failed lookup as empty so the string handling below stays safe.
	            phonemes = get_phonemes(word) or ''

	        # If word has Ezafe (EZ tag), modify phoneme
	        if 'EZ' in tag:
	            if phonemes.endswith(('jeː', 'je')):  # e.g برای
	                pass
	            elif phonemes.endswith(('ː', 'i', 'e')):  # long vowel, e.g زندگی, e.g مدرسه
	                phonemes += 'je'
	            else:
	                phonemes += 'e'

	        phoneme_list.append(phonemes)

	    phoneme_text = ' '.join(phoneme_list)
	    return re.sub(r"\s+", " ", phoneme_text)

	# FastAPI input model
	# Request body accepted by the POST /phonemize route: one string field.
	class InputText(BaseModel):
	    # Text to be normalized and converted to phonemes.
	    text: str

	# Route
	# Landing endpoint: returns a static welcome message pointing at /phonemize.
	@app.get("/")
	async def root():
	    welcome = "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."
	    return {"message": welcome}

	# Phonemization endpoint: normalize the submitted text (punctuation kept),
	# convert it to phonemes and return them under the "phonemes" key.
	# NOTE(review): process_sentence runs an external process synchronously, so
	# this coroutine blocks the event loop while it works - consider offloading.
	@app.post("/phonemize")
	async def phonemize(input_data: InputText):
	    cleaned_text = normalizer.normalize(input_data.text, remove_punct=False)
	    return {"phonemes": process_sentence(cleaned_text, tagger, pattern, punctuation)}