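# "Spaces: Sleeping" -- appears to be status text from the hosting page that was
# captured together with the file; it is not part of the program.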
import re
import string
import subprocess

from fastapi import FastAPI, Request
from hazm import POSTagger, word_tokenize
from parsnorm import ParsNorm
from pydantic import BaseModel
app = FastAPI()

# Setup: objects shared by every request, built once at import time.
normalizer = ParsNorm(remove_diacritics=False)
tagger = POSTagger(model='./pos_tagger.model')  # Make sure this model is present

# ASCII punctuation plus the Persian/Arabic marks that appear in the input text.
punctuation = string.punctuation + "؟:؛»«،"

# Zero-width match at every boundary between a word character and a punctuation
# character (in either order). process_sentence substitutes a space at these
# positions so punctuation becomes its own token.
pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"
# Homographs whose pronunciation depends on their part of speech.
# For each word, 'phonemes' and 'pos' are parallel lists: phonemes[i] is the
# transcription to use when the tagger assigns pos[i].
ambiguity_dict = {
    'بعد': {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    # Fixed pairing: ʃeʃ is the numeral "six", ʃoʃ is the noun "lung".
    'شش': {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'سقط': {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'می': {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
    'روی': {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'رو': {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'ولو': {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
    'ده': {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'خیر': {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'اولی': {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
    'مایل': {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سنی': {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سبک': {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'کر': {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'نرم': {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
    'جدا': {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
    'معین': {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'خلقی': {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'بردار': {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مرد': {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مقدم': {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'پست': {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'شما': {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
    'تنگ': {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    # NOTE(review): sefr ("zero") looks like the NUM reading and safar the NOUN
    # one, i.e. this pairing may be swapped. Left unchanged because both readings
    # can be tagged NOUN -- confirm against real tagger output before editing.
    'صفر': {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    # Fixed pairing: por is the adjective "full", par is the noun "feather".
    'پر': {'phonemes': ['por', 'par'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'مصر': {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کشت': {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'کی': {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
    'جور': {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کرد': {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'علی': {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'شست': {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'دهم': {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
}
def get_phoneme_for_pos(entry, target_pos):
    """Return the transcription that an ambiguity entry lists for a POS tag.

    Args:
        entry: Mapping with parallel sequences under 'pos' and 'phonemes'.
        target_pos: POS tag to look up.

    Returns:
        The phoneme string paired with the first matching tag, or None when
        the entry does not list ``target_pos``.
    """
    # zip keeps the two parallel lists in step and cannot index past the
    # shorter one if an entry is malformed.
    for pos_tag, phoneme in zip(entry['pos'], entry['phonemes']):
        if pos_tag == target_pos:
            return phoneme
    return None
def get_phonemes(word):
    """Get the IPA phonemes of a word from espeak-ng without playing audio.

    The Persian voice is used. Stress marks (ˈ, ˌ) and the length mark (ː) are
    removed from the transcription, and the sequence "q1" is rewritten to "q".

    Args:
        word: Text to transcribe.

    Returns:
        The cleaned transcription with surrounding whitespace stripped; an
        empty string when espeak-ng could not be run; None when its output
        could not be decoded.
    """
    try:
        # Argument list with no shell: `word` originates from request input, so
        # it must never be interpolated into a shell command line. The "--"
        # stops option parsing in case the text starts with "-".
        result = subprocess.run(
            ['espeak-ng', '-v', 'fa', '--ipa', '-q', '--', word],
            capture_output=True,
            text=True,
            encoding='utf-8',
        )
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}\n{word}")
        return None
    except (OSError, ValueError) as e:
        # Missing executable or un-passable argument. The former shell pipeline
        # produced empty output in this situation, so keep returning "".
        print(f"espeak-ng could not be run: {e}\n{word}")
        return ''

    # Same clean-up the old `sed` pipeline performed, in the same order.
    cleaned = result.stdout.translate(str.maketrans('', '', 'ˈˌː'))
    return cleaned.replace('q1', 'q').strip()
def _is_punctuation(token, punctuation):
    """Return True when a non-empty token consists only of punctuation characters."""
    return bool(token) and all(ch in punctuation for ch in token)


def _apply_ezafe(phonemes):
    """Append the Ezafe ending to a transcription, with a 'j' glide after a vowel."""
    if phonemes.endswith(('jeː', 'je')):  # e.g برای -- ending already present
        return phonemes
    if phonemes.endswith(('ː', 'i', 'e')):  # long vowel, e.g زندگی, e.g مدرسه
        return phonemes + 'je'
    return phonemes + 'e'


def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

    Args:
        sentence: Text to transcribe.
        tagger: Object whose ``tag(tokens)`` returns one ``(token, tag)`` pair
            per token; a tag containing 'EZ' marks an Ezafe construction.
        pattern: Regex matching word/punctuation boundaries; a space is
            inserted at every match so punctuation is tokenized separately.
        punctuation: String of the characters treated as punctuation.

    Returns:
        The space-separated transcription. Punctuation tokens are glued onto
        the preceding transcribed word (or kept as-is at the very start).
    """
    sentence = re.sub(pattern, r' ', sentence)
    words = word_tokenize(sentence)
    tagged_words = tagger.tag(words)
    phoneme_list = []

    # The tagger tags every token, punctuation included, so each token is
    # paired with its own tag; a separate counter that skips punctuation would
    # drift out of step after the first punctuation mark.
    for word, tagged in zip(words, tagged_words):
        if not word:
            continue
        if _is_punctuation(word, punctuation):
            if phoneme_list:
                phoneme_list[-1] += word
            else:
                phoneme_list.append(word)
            continue

        tag = tagged[1]
        # Multi-part tokens are joined with '_' or a zero-width non-joiner.
        word = word.replace('_', ' ').replace('\u200c', ' ')

        # Prefer the POS-specific pronunciation of a known homograph; only fall
        # back to espeak-ng (a subprocess call) when there is none.
        entry = ambiguity_dict.get(word)
        phonemes = get_phoneme_for_pos(entry, tag.replace(',EZ', '')) if entry else None
        if phonemes is None:
            # get_phonemes may return None on failure; treat that as "no output".
            phonemes = get_phonemes(word) or ''

        # If word has Ezafe (EZ tag), modify phoneme
        if phonemes and 'EZ' in tag:
            phonemes = _apply_ezafe(phonemes)
        phoneme_list.append(phonemes)

    phoneme_text = ' '.join(phoneme_list)
    return re.sub(r"\s+", " ", phoneme_text)
# FastAPI input model
class InputText(BaseModel):
    """Request body carrying the text to be phonemized."""

    text: str  # raw input text
# Routes
@app.get("/")
async def root():
    """Return a welcome message pointing clients at the /phonemize endpoint."""
    return {"message": "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."}
@app.post("/phonemize")
async def phonemize(input_data: InputText):
    """Normalize the submitted text and return its phoneme transcription.

    Args:
        input_data: Request body; its ``text`` field is the text to process.

    Returns:
        A dict with the transcription under the "phonemes" key.
    """
    # NOTE(review): process_sentence does blocking work (subprocess calls) inside
    # this coroutine -- consider offloading it if request concurrency matters.
    normalized = normalizer.normalize(input_data.text, remove_punct=False)
    result = process_sentence(normalized, tagger, pattern, punctuation)
    return {"phonemes": result}