Spaces:

saeedzou
/

Persian_Phonemizer

Sleeping

App Files Files Community

saeedzou committed on Apr 8

Commit

33faa47

verified ·

1 Parent(s): 919f19c

Update phonemizer.py

Browse files

Files changed (1) hide show

phonemizer.py +127 -123

phonemizer.py CHANGED Viewed

@@ -1,124 +1,128 @@
-import subprocess
-import re
-import string
-from fastapi import FastAPI, Request
-from pydantic import BaseModel
-from hazm import POSTagger, word_tokenize
-from parsnorm import ParsNorm
-app = FastAPI()
-# Setup
-normalizer = ParsNorm(remove_diacritics=False)
-tagger = POSTagger(model='./pos_tagger.model')  # Make sure this model is present
-punctuation = string.punctuation + "؟:؛»«،"
-pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"
-ambiguity_dict = {
-    'بعد' : {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
-    'شش' : {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NOUN', 'NUM'], 'diff': True},
-    'سقط' : {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
-    'می' : {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
-    'روی' : {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
-    'رو' : {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
-    'ولو' : {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
-    'ده' : {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
-    'خیر' : {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
-    'اولی' : {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
-    'مایل' : {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
-    'سنی' : {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
-    'سبک' : {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
-    'کر' : {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
-    'نرم' : {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
-    'جدا' : {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
-    'معین' : {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
-    'خلقی' : {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
-    'بردار' : {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
-    'مرد' : {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
-    'مقدم' : {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
-    'پست' : {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
-    'شما' : {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
-    'تنگ' : {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
-    'صفر' : {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
-    'پر' : {'phonemes': ['por', 'par'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
-    'مصر' : {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
-    'کشت' : {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
-    'کی' : {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
-    'جور' : {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
-    'کرد' : {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
-    'علی' : {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
-    'شست' : {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
-    'دهم' : {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
-}
-def get_phoneme_for_pos(entry, target_pos):
-    for i, pos_tag in enumerate(entry['pos']):
-        if pos_tag == target_pos:
-            return entry['phonemes'][i]
-    return None  # Return None if target POS tag is not found
-def get_phonemes(word):
-    """Get phonemes of a word using espeak-ng without playing audio, and remove apostrophes."""
-    cmd = f'espeak-ng -v fa --ipa -q "{word}" | sed "s/[ˈˌː]//g" | sed "s/q1/q/g"'
-    try:
-        # Run the subprocess with 'latin1' encoding to handle special characters
-        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-        # Remove apostrophes from phonemes and strip any unwanted spaces or newlines
-        return result.stdout.strip()
-    except UnicodeDecodeError as e:
-        print(f"UnicodeDecodeError: {e}\n{word}")
-        return None  # Or handle the error appropriately
-def process_sentence(sentence, tagger, pattern, punctuation):
-    sentence = re.sub(pattern, r' ', sentence)
-    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation."""
-    words = word_tokenize(sentence)
-    tagged_words = tagger.tag(words)
-    phoneme_list = []
-    tag_index = 0  # Track the index of words that get POS tags
-    for word in words:
-        if word in punctuation:
-            phoneme_list.append(word)
-        else: # If it's a word, process normally
-            word = word.replace('_', ' ').replace('\u200c', ' ')
-            phonemes = get_phonemes(word)
-            kaamel_phonemes = ambiguity_dict.get(word)
-            if kaamel_phonemes:
-                if tagged_words[tag_index][1].replace(',EZ', '') in kaamel_phonemes['pos']:
-                    phonemes = get_phoneme_for_pos(kaamel_phonemes, tagged_words[tag_index][1].replace(',EZ', ''))
-            # If word has Ezafe (EZ tag), modify phoneme
-            if 'EZ' in tagged_words[tag_index][1]:
-                if phonemes.endswith('jeː'):
-                    pass
-                elif phonemes.endswith('ː'):  # Ends in long vowel
-                    phonemes += 'je'
-                elif phonemes.endswith('i'): # e.g زندگی
-                    phonemes += 'je'
-                elif phonemes.endswith('je'): # e.g برای
-                    pass
-                elif phonemes.endswith('e'): # e.g مدرسه
-                    phonemes += 'je'
-                else:
-                    phonemes += 'e'
-            phoneme_list.append(phonemes)
-        tag_index += 1  # Move to next tagged word
-    phoneme_text = ' '.join(phoneme_list)
-    phoneme_text = re.sub(r"\s+", " ", phoneme_text)
-    return phoneme_text
-# FastAPI input model
-class InputText(BaseModel):
-    text: str
-# Route
-@app.post("/phonemize")
-async def phonemize(input_data: InputText):
-    normalized = normalizer.normalize(input_data.text, remove_punct=False)
-    result = process_sentence(normalized, tagger, pattern, punctuation)
     return {"phonemes": result}

+import subprocess
+import re
+import string
+from fastapi import FastAPI, Request
+from pydantic import BaseModel
+from hazm import POSTagger, word_tokenize
+from parsnorm import ParsNorm
+# ASGI application object; the route decorators further down register handlers on it.
+app = FastAPI()
+# Setup: module-level objects created once at import time and shared by all requests.
+# Normalizer is constructed with remove_diacritics=False (presumably so diacritics
+# in the input survive normalization — confirm against ParsNorm).
+normalizer = ParsNorm(remove_diacritics=False)
+tagger = POSTagger(model='./pos_tagger.model')  # Make sure this model is present
+# ASCII punctuation plus Persian/Arabic marks (question mark, semicolon, guillemets, comma).
+punctuation = string.punctuation + "؟:؛»«،"
+# Zero-width regex matching every boundary between a word character (\w) and one
+# of the punctuation characters above, in either order. process_sentence replaces
+# each match with a space so punctuation becomes its own token.
+# Note: "_" is both \w and in string.punctuation, so boundaries next to an
+# underscore also match.
+pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"
+# Pronunciation overrides for Persian homographs, keyed by the written form.
+# 'phonemes' and 'pos' are parallel lists: phonemes[i] is the reading chosen when
+# the POS tagger assigns pos[i] to the word (see get_phoneme_for_pos and
+# process_sentence). The 'diff' flag is not read by any code shown here.
+# NOTE(review): a few pairings look swapped — e.g. 'شش' (NUM "six" is usually ʃeʃ),
+# 'صفر' (NUM "zero" is usually sefr) and 'پر' (ADJ "full" is usually por).
+# Confirm against the tagger's actual output before changing them.
+ambiguity_dict = {
+    'بعد' : {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
+    'شش' : {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NOUN', 'NUM'], 'diff': True},
+    'سقط' : {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
+    'می' : {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
+    'روی' : {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
+    'رو' : {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
+    'ولو' : {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
+    'ده' : {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
+    'خیر' : {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
+    'اولی' : {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
+    'مایل' : {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
+    'سنی' : {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
+    'سبک' : {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
+    'کر' : {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
+    'نرم' : {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
+    'جدا' : {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
+    'معین' : {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
+    'خلقی' : {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
+    'بردار' : {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
+    'مرد' : {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
+    'مقدم' : {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
+    'پست' : {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
+    'شما' : {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
+    'تنگ' : {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
+    'صفر' : {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
+    'پر' : {'phonemes': ['por', 'par'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
+    'مصر' : {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
+    'کشت' : {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
+    'کی' : {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
+    'جور' : {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
+    'کرد' : {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
+    'علی' : {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
+    'شست' : {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
+    'دهم' : {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
+}
+def get_phoneme_for_pos(entry, target_pos):
+    """Return the pronunciation that `entry` lists for `target_pos`.
+
+    `entry['pos']` and `entry['phonemes']` are parallel sequences. The result
+    is the phoneme string stored at the position of the first tag equal to
+    `target_pos`, or None when no tag matches.
+    """
+    # Position of the first matching tag, or None if there is no match.
+    match_at = next(
+        (slot for slot, candidate in enumerate(entry['pos']) if candidate == target_pos),
+        None,
+    )
+    if match_at is None:
+        return None
+    return entry['phonemes'][match_at]
+def get_phonemes(word):
+    """Return the IPA transcription espeak-ng produces for `word` (Persian voice).
+
+    espeak-ng is run with `-q`, so no audio is played. The stress marks
+    (ˈ, ˌ) and the length mark (ː) are removed from its output, every "q1"
+    is rewritten to "q", and surrounding whitespace is stripped.
+
+    The word is handed to espeak-ng as a single argv entry and never passes
+    through a shell, so quotes, `$`, backticks and similar characters in
+    `word` cannot be interpreted as shell syntax.
+
+    Args:
+        word: Text to transcribe.
+
+    Returns:
+        The cleaned IPA string, or None if the output cannot be decoded.
+
+    Raises:
+        FileNotFoundError: If the espeak-ng executable cannot be found.
+    """
+    # "--" ends option parsing so a word starting with "-" is treated as text.
+    cmd = ['espeak-ng', '-v', 'fa', '--ipa', '-q', '--', word]
+    try:
+        # Decode as UTF-8 explicitly instead of relying on the process locale.
+        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')
+    except UnicodeDecodeError as e:
+        print(f"UnicodeDecodeError: {e}\n{word}")
+        return None
+    # Same clean-up the former sed pipeline performed, done on decoded text so
+    # it works per character regardless of locale.
+    ipa = result.stdout.translate(str.maketrans('', '', 'ˈˌː'))
+    return ipa.replace('q1', 'q').strip()
+def _add_ezafe(phonemes):
+    """Return `phonemes` with the Ezafe linking vowel attached."""
+    if phonemes.endswith(('jeː', 'je')):  # already ends in the Ezafe sound, e.g برای
+        return phonemes
+    if phonemes.endswith(('ː', 'i', 'e')):  # long vowel / e.g زندگی / e.g مدرسه
+        return phonemes + 'je'
+    return phonemes + 'e'
+def process_sentence(sentence, tagger, pattern, punctuation):
+    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.
+
+    Steps: put a space at every match of `pattern`, tokenize, POS-tag the
+    tokens, then map each token to phonemes. Tokens made only of punctuation
+    characters (e.g. "..." or "!!") are copied through unchanged. A word
+    found in the module-level `ambiguity_dict` uses the reading listed for
+    its POS tag; every other word is transcribed with `get_phonemes`. Words
+    whose tag contains "EZ" get the Ezafe vowel appended.
+
+    Args:
+        sentence: Text to convert.
+        tagger: Object whose `tag(words)` returns one (word, tag) pair per token.
+        pattern: Regex whose matches are replaced with a space before tokenizing.
+        punctuation: Container of punctuation characters.
+
+    Returns:
+        The phonemes and punctuation joined by single spaces.
+    """
+    sentence = re.sub(pattern, r' ', sentence)
+    words = word_tokenize(sentence)
+    tagged_words = tagger.tag(words)
+    phoneme_list = []
+    for index, word in enumerate(words):
+        # Check character by character: `word in punctuation` on a string is a
+        # substring test and misses tokens such as "..." or "!!".
+        if all(ch in punctuation for ch in word):
+            phoneme_list.append(word)
+            continue
+        word = word.replace('_', ' ').replace('\u200c', ' ')
+        tag = tagged_words[index][1]
+        base_tag = tag.replace(',EZ', '')
+        entry = ambiguity_dict.get(word)
+        if entry and base_tag in entry['pos']:
+            # Homograph with a reading for this POS: no espeak-ng call needed.
+            phonemes = get_phoneme_for_pos(entry, base_tag)
+        else:
+            # get_phonemes returns None on a decode error; fall back to an
+            # empty string so the steps below cannot fail on None.
+            phonemes = get_phonemes(word) or ''
+        # If word has Ezafe (EZ tag), modify phoneme (nothing to attach to when empty)
+        if phonemes and 'EZ' in tag:
+            phonemes = _add_ezafe(phonemes)
+        phoneme_list.append(phonemes)
+    phoneme_text = ' '.join(phoneme_list)
+    return re.sub(r"\s+", " ", phoneme_text)
+# FastAPI input model
+# Request body accepted by the POST /phonemize route: one string field.
+class InputText(BaseModel):
+    # Text to be normalized and converted to phonemes.
+    text: str
+# Route
+@app.get("/")
+async def root():
+    # Landing route: tells API users to use the /phonemize endpoint.
+    welcome = "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."
+    return {"message": welcome}
+# POST /phonemize: normalize the submitted text, convert it to phonemes and
+# return the result under the "phonemes" key.
+# NOTE(review): process_sentence does blocking work (it spawns espeak-ng
+# subprocesses) inside an `async def` handler — confirm this is acceptable
+# for the event loop.
+@app.post("/phonemize")
+async def phonemize(input_data: InputText):
+    # remove_punct=False is passed, presumably so punctuation survives
+    # normalization and reaches process_sentence — confirm against ParsNorm.
+    normalized = normalizer.normalize(input_data.text, remove_punct=False)
+    # Uses the module-level tagger, boundary regex and punctuation string.
+    result = process_sentence(normalized, tagger, pattern, punctuation)
     return {"phonemes": result}