File size: 6,625 Bytes
33faa47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68d943b
 
 
 
33faa47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68d943b
33faa47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32064f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import subprocess
import re
import string
from fastapi import FastAPI, Request
from pydantic import BaseModel
from hazm import POSTagger, word_tokenize
from parsnorm import ParsNorm

app = FastAPI()
# Setup: module-level resources built once at import time and reused by the
# /phonemize route on every request.
# Text normalizer, constructed with remove_diacritics=False.
normalizer = ParsNorm(remove_diacritics=False)
# POS tagger loaded from a model file path relative to the working directory.
tagger = POSTagger(model='./pos_tagger.model')  # Make sure this model is present
# ASCII punctuation plus a handful of Persian/Arabic punctuation marks.
punctuation = string.punctuation + "؟:؛»«،"
# Zero-width match at every boundary between a word character and one of the
# punctuation characters (in either order); process_sentence substitutes a
# space at these positions before tokenizing.
pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"

# Homograph table: Persian spellings whose pronunciation depends on the POS tag.
# For each word, 'phonemes' and 'pos' are parallel lists: phonemes[i] is the
# pronunciation used when the tagger assigns pos[i] (see get_phoneme_for_pos).
# The 'diff' flag is not read by the code in this part of the file.
#
# The POS order of three entries was corrected so each tag selects the right
# reading: 'شش' (NUM = ʃeʃ "six", NOUN = ʃoʃ "lung"), 'صفر' (NUM = sefr "zero",
# NOUN = safar, the month) and 'پر' (ADJ = por "full", NOUN = par "feather").
# NOTE(review): 'سقط' and 'معین' may have the same kind of mix-up — confirm
# against the tagger's actual output before changing them.
ambiguity_dict = {
    'بعد' : {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'شش' : {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'سقط' : {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'می' : {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
    'روی' : {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'رو' : {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'ولو' : {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
    'ده' : {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'خیر' : {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'اولی' : {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
    'مایل' : {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سنی' : {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سبک' : {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'کر' : {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'نرم' : {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
    'جدا' : {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
    'معین' : {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'خلقی' : {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'بردار' : {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مرد' : {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مقدم' : {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'پست' : {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'شما' : {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
    'تنگ' : {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'صفر' : {'phonemes': ['safar', 'sefr'], 'pos': ['NOUN', 'NUM'], 'diff': True},
    'پر' : {'phonemes': ['por', 'par'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'مصر' : {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کشت' : {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'کی' : {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
    'جور' : {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کرد' : {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'علی' : {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'شست' : {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'دهم' : {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
}
def get_phoneme_for_pos(entry, target_pos):
    """Look up the pronunciation paired with a POS tag in an ambiguity entry.

    ``entry['pos']`` and ``entry['phonemes']`` are read as parallel sequences:
    the phoneme stored at the position of the first tag equal to
    ``target_pos`` is returned. When no tag matches, the result is ``None``.
    """
    match_at = next(
        (idx for idx, tag in enumerate(entry['pos']) if tag == target_pos),
        None,
    )
    if match_at is None:
        return None
    return entry['phonemes'][match_at]


def get_phonemes(word):
    """Return the IPA transcription of ``word`` from espeak-ng's Persian voice.

    espeak-ng is run quietly (``-q``, no audio) with ``--ipa``. The stress
    marks ``ˈ``/``ˌ`` and the length mark ``ː`` are removed from its output,
    every ``q1`` is rewritten to ``q``, and surrounding whitespace is stripped.

    Args:
        word: Text to transcribe. It is handed to espeak-ng as a single
            argument and is never interpreted by a shell.

    Returns:
        The cleaned IPA string, or ``None`` if espeak-ng's output could not be
        decoded as UTF-8.

    Raises:
        FileNotFoundError: If the ``espeak-ng`` executable is not on PATH.
    """
    try:
        # Argument list + no shell: quotes, `$`, backticks or `;` inside the
        # (user-supplied) word can no longer be executed as shell syntax.
        # Decoding is pinned to UTF-8 so IPA output does not depend on the
        # process locale.
        result = subprocess.run(
            ['espeak-ng', '-v', 'fa', '--ipa', '-q', word],
            capture_output=True,
            text=True,
            encoding='utf-8',
        )
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}\n{word}")
        return None

    # Same clean-up the former `sed` pipeline performed, done in-process.
    ipa = re.sub('[ˈˌː]', '', result.stdout)
    return ipa.replace('q1', 'q').strip()


def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

    Args:
        sentence: Text to phonemize.
        tagger: Object whose ``tag(words)`` returns one ``(word, tag)`` pair
            per token; a tag containing ``EZ`` marks an Ezafe construction.
        pattern: Regex (string or compiled) whose matches are replaced by a
            space so punctuation is separated from adjacent words.
        punctuation: String of characters to treat as punctuation.

    Returns:
        Space-separated phonemes. Punctuation tokens are glued to the end of
        the preceding phoneme group (or kept as-is at the very start).
    """
    sentence = re.sub(pattern, ' ', sentence)
    words = word_tokenize(sentence)
    if not words:
        # Nothing to tag or transcribe.
        return ''
    tagged_words = tagger.tag(words)

    # Per-character membership: a plain `word in punctuation` on a string is a
    # substring test, which misses multi-character tokens such as "..." or
    # "!!" and matches the empty string.
    punct_chars = set(punctuation)

    phoneme_list = []
    # tagger.tag() tags every token (punctuation included), so tokens and tags
    # are walked in lockstep.
    for word, tagged in zip(words, tagged_words):
        if not word:
            continue

        if all(ch in punct_chars for ch in word):
            if phoneme_list:
                phoneme_list[-1] += word
            else:
                phoneme_list.append(word)
            continue

        pos_tag = tagged[1]
        base_tag = pos_tag.replace(',EZ', '')
        word = word.replace('_', ' ').replace('\u200c', ' ')

        # Homographs are resolved from the table by POS tag; espeak-ng is only
        # spawned when the table cannot decide.
        entry = ambiguity_dict.get(word)
        if entry and base_tag in entry['pos']:
            phonemes = get_phoneme_for_pos(entry, base_tag)
        else:
            phonemes = get_phonemes(word)
        if phonemes is None:
            # get_phonemes reports a decoding failure as None; skip the token
            # instead of crashing on .endswith()/join below.
            continue

        # If word has Ezafe (EZ tag), modify phoneme
        if 'EZ' in pos_tag:
            # NOTE(review): get_phonemes removes 'ː' from espeak-ng output, so
            # the 'ː' endings below do not fire for it; words ending in a long
            # vowel other than 'i' currently get the plain 'e' suffix —
            # confirm this is intended.
            if phonemes.endswith(('jeː', 'je')):  # e.g برای
                pass
            elif phonemes.endswith(('ː', 'i', 'e')):  # e.g زندگی , مدرسه
                phonemes += 'je'
            else:
                phonemes += 'e'

        phoneme_list.append(phonemes)

    phoneme_text = ' '.join(phoneme_list)
    return re.sub(r"\s+", " ", phoneme_text)

# FastAPI input model
class InputText(BaseModel):
    # Request body accepted by the POST /phonemize route.
    text: str  # Text to phonemize; the route passes it to normalizer.normalize().

# Route
@app.get("/")
async def root():
    # Landing route: returns a fixed greeting that points clients at /phonemize.
    welcome = "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."
    return {"message": welcome}

@app.post("/phonemize")
async def phonemize(input_data: InputText):
    # Normalize the submitted text (punctuation kept), then phonemize it with
    # the module-level tagger / pattern / punctuation resources.
    # NOTE(review): process_sentence does its work synchronously (it reaches
    # subprocess.run through get_phonemes) inside this async handler — confirm
    # that blocking here is acceptable for the expected load.
    cleaned_text = normalizer.normalize(input_data.text, remove_punct=False)
    return {"phonemes": process_sentence(cleaned_text, tagger, pattern, punctuation)}