Spaces:
Running
Running
File size: 6,625 Bytes
33faa47 68d943b 33faa47 68d943b 33faa47 32064f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import subprocess
import re
import string
from fastapi import FastAPI, Request
from pydantic import BaseModel
from hazm import POSTagger, word_tokenize
from parsnorm import ParsNorm
# ASGI application object; the route decorators further down register on it.
app = FastAPI()
# Setup: the objects below are created once, at import time, and reused by the
# /phonemize handler for every request.
# Text normalizer; diacritics are kept (remove_diacritics=False).
normalizer = ParsNorm(remove_diacritics=False)
# POS tagger loaded from a model file; the path is relative to the process's
# working directory, not to this source file.
tagger = POSTagger(model='./pos_tagger.model') # Make sure this model is present
# Every character treated as punctuation: the ASCII set from the standard
# library followed by the Persian/Arabic marks used in running text.
punctuation = string.punctuation + "؟:؛»«،"
# Zero-width regex that matches each boundary between a word character and a
# punctuation character, in either order. Substituting a space at those
# positions detaches punctuation from the words it touches.
pattern = r"(?<=\w)(?=[{0}])|(?<=[{0}])(?=\w)".format(re.escape(punctuation))
# Homographs whose pronunciation depends on their part of speech.
#
# Each key is a written form. 'phonemes' and 'pos' are parallel lists:
# phonemes[i] is the pronunciation to use when the tagger assigns pos[i]
# (this is how get_phoneme_for_pos reads an entry). The 'diff' flag is carried
# along with each entry but is not read by the code in this section.
#
# The pairings for 'شش', 'صفر' and 'پر' were reversed and are corrected here so
# that the numeral / adjective reading gets the matching pronunciation.
ambiguity_dict = {
    'بعد' : {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    # ʃeʃ = "six" (NUM), ʃoʃ = "lung" (NOUN)
    'شش' : {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'سقط' : {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'می' : {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
    'روی' : {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'رو' : {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'ولو' : {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
    'ده' : {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'خیر' : {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'اولی' : {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
    'مایل' : {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سنی' : {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سبک' : {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'کر' : {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'نرم' : {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
    'جدا' : {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
    'معین' : {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'خلقی' : {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'بردار' : {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مرد' : {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مقدم' : {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'پست' : {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'شما' : {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
    'تنگ' : {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    # safar = the month Safar (NOUN), sefr = "zero" (NUM)
    'صفر' : {'phonemes': ['safar', 'sefr'], 'pos': ['NOUN', 'NUM'], 'diff': True},
    # por = "full" (ADJ), par = "feather" (NOUN)
    'پر' : {'phonemes': ['por', 'par'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'مصر' : {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کشت' : {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'کی' : {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
    'جور' : {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کرد' : {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'علی' : {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'شست' : {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'دهم' : {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
}
def get_phoneme_for_pos(entry, target_pos):
    """Look up the pronunciation that an ambiguity entry pairs with a POS tag.

    Args:
        entry: Mapping with parallel sequences under 'pos' and 'phonemes'.
        target_pos: POS tag to search for in ``entry['pos']``.

    Returns:
        The element of ``entry['phonemes']`` at the position of the first
        occurrence of ``target_pos`` in ``entry['pos']``, or ``None`` when the
        tag is not listed.
    """
    try:
        position = entry['pos'].index(target_pos)
    except ValueError:
        # Tag is not one of the readings recorded for this word.
        return None
    return entry['phonemes'][position]
# Primary stress (ˈ), secondary stress (ˌ) and length (ː) marks that are removed
# from espeak-ng's IPA output.
_IPA_MARKS_TO_DROP = str.maketrans('', '', 'ˈˌː')


def get_phonemes(word):
    """Return the IPA transcription of *word* produced by espeak-ng (Persian voice).

    espeak-ng is run in quiet mode (``-q``), so no audio is played. Stress and
    length marks are removed from its output, the sequence ``q1`` is rewritten
    to ``q``, and surrounding whitespace is stripped.

    Args:
        word: Text to transcribe. It may come straight from an API request, so
            it is never passed through a shell.

    Returns:
        The cleaned transcription, or ``None`` if espeak-ng's output could not
        be decoded as UTF-8.

    Raises:
        FileNotFoundError: If the ``espeak-ng`` executable is not installed.
    """
    # Argument list + no shell: quotes, backticks, `$(...)` or `;` inside
    # `word` are plain text to espeak-ng instead of shell syntax. `--` ends
    # option parsing so text beginning with '-' is not read as a flag.
    cmd = ['espeak-ng', '-v', 'fa', '--ipa', '-q', '--', word]
    try:
        # IPA output is UTF-8; pin the encoding rather than relying on the
        # process locale.
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}\n{word}")
        return None
    # Same clean-up the former `sed` pipeline performed, done in-process.
    cleaned = result.stdout.translate(_IPA_MARKS_TO_DROP).replace('q1', 'q')
    return cleaned.strip()
def _append_ezafe(phonemes):
    """Return *phonemes* with the Ezafe linking sound attached."""
    # Already ends in the linking syllable (e.g. برای): leave untouched.
    if phonemes.endswith(('jeː', 'je')):
        return phonemes
    # After a long vowel, 'i' (e.g. زندگی) or 'e' (e.g. مدرسه) a glide is inserted.
    if phonemes.endswith(('ː', 'i', 'e')):
        return phonemes + 'je'
    return phonemes + 'e'


def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

    Args:
        sentence: Input text.
        tagger: Object whose ``tag(tokens)`` returns one ``(token, tag)`` pair
            per input token, in order.
        pattern: Regex whose matches are replaced by a space before
            tokenizing (used to detach punctuation from words).
        punctuation: String containing every punctuation character.

    Returns:
        The phonemized words joined by single spaces. Punctuation tokens are
        glued onto the preceding phoneme group (or kept as their own item when
        they open the sentence).
    """
    sentence = re.sub(pattern, r' ', sentence)
    words = word_tokenize(sentence)
    tagged_words = tagger.tag(words)
    phoneme_list = []
    # The tagger emits a tag for every token, punctuation included, so tokens
    # and tags are walked in lockstep. Advancing a separate tag counter only on
    # non-punctuation tokens would shift every tag after the first punctuation
    # mark onto the wrong word.
    for word, tagged in zip(words, tagged_words):
        tag = tagged[1]
        # Character-wise test so runs such as "..." or "!!" count as punctuation
        # too (a substring test against `punctuation` misses them).
        if all(ch in punctuation for ch in word):
            if phoneme_list:
                phoneme_list[-1] += word
            else:
                phoneme_list.append(word)
            continue

        # The tokenizer may join multi-part words with '_' or ZWNJ; treat both
        # as word breaks before phonemizing.
        word = word.replace('_', ' ').replace('\u200c', ' ')
        base_tag = tag.replace(',EZ', '')
        entry = ambiguity_dict.get(word)
        if entry and base_tag in entry['pos']:
            # Known homograph: take the reading recorded for this POS tag and
            # skip the external phonemizer call entirely.
            phonemes = get_phoneme_for_pos(entry, base_tag)
        else:
            phonemes = get_phonemes(word)
        if phonemes is None:
            # get_phonemes may return None on failure; drop the token rather
            # than crash on the string operations below.
            continue
        # If word has Ezafe (EZ tag), modify phoneme
        if 'EZ' in tag:
            phonemes = _append_ezafe(phonemes)
        phoneme_list.append(phonemes)
    phoneme_text = ' '.join(phoneme_list)
    return re.sub(r"\s+", " ", phoneme_text)
# Request body schema for the /phonemize endpoint.
class InputText(BaseModel):
    # The text to normalize and convert to phonemes.
    text: str
# Landing route: tells clients which endpoint does the actual work.
@app.get("/")
async def root():
    welcome = "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."
    return {"message": welcome}
@app.post("/phonemize")
async def phonemize(input_data: InputText):
    # Normalize the submitted text (punctuation is kept), then hand it to the
    # phonemizer together with the module-level tagger and punctuation config.
    # NOTE(review): process_sentence is called without await and appears to do
    # synchronous work; if so it occupies the event loop for the whole
    # request — confirm whether this handler should be a plain `def`.
    cleaned_text = normalizer.normalize(input_data.text, remove_punct=False)
    return {"phonemes": process_sentence(cleaned_text, tagger, pattern, punctuation)}