Spaces:
Sleeping
Sleeping
Update phonemizer.py
Browse files- phonemizer.py +127 -123
phonemizer.py
CHANGED
@@ -1,124 +1,128 @@
|
|
1 |
-
import subprocess
|
2 |
-
import re
|
3 |
-
import string
|
4 |
-
from fastapi import FastAPI, Request
|
5 |
-
from pydantic import BaseModel
|
6 |
-
from hazm import POSTagger, word_tokenize
|
7 |
-
from parsnorm import ParsNorm
|
8 |
-
|
9 |
-
app = FastAPI()
|
10 |
-
# Setup
|
11 |
-
normalizer = ParsNorm(remove_diacritics=False)
|
12 |
-
tagger = POSTagger(model='./pos_tagger.model') # Make sure this model is present
|
13 |
-
punctuation = string.punctuation + "؟:؛»«،"
|
14 |
-
pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"
|
15 |
-
|
16 |
-
ambiguity_dict = {
|
17 |
-
'بعد' : {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
|
18 |
-
'شش' : {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NOUN', 'NUM'], 'diff': True},
|
19 |
-
'سقط' : {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
|
20 |
-
'می' : {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
|
21 |
-
'روی' : {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
|
22 |
-
'رو' : {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
|
23 |
-
'ولو' : {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
|
24 |
-
'ده' : {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
|
25 |
-
'خیر' : {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
|
26 |
-
'اولی' : {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
|
27 |
-
'مایل' : {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
|
28 |
-
'سنی' : {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
|
29 |
-
'سبک' : {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
|
30 |
-
'کر' : {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
|
31 |
-
'نرم' : {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
|
32 |
-
'جدا' : {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
|
33 |
-
'معین' : {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
|
34 |
-
'خلقی' : {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
|
35 |
-
'بردار' : {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
|
36 |
-
'مرد' : {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
|
37 |
-
'مقدم' : {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
|
38 |
-
'پست' : {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
|
39 |
-
'شما' : {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
|
40 |
-
'تنگ' : {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
|
41 |
-
'صفر' : {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
|
42 |
-
'پر' : {'phonemes': ['por', 'par'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
|
43 |
-
'مصر' : {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
|
44 |
-
'کشت' : {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
|
45 |
-
'کی' : {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
|
46 |
-
'جور' : {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
|
47 |
-
'کرد' : {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
|
48 |
-
'علی' : {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
|
49 |
-
'شست' : {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
|
50 |
-
'دهم' : {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
|
51 |
-
}
|
52 |
-
def get_phoneme_for_pos(entry, target_pos):
|
53 |
-
for i, pos_tag in enumerate(entry['pos']):
|
54 |
-
if pos_tag == target_pos:
|
55 |
-
return entry['phonemes'][i]
|
56 |
-
return None # Return None if target POS tag is not found
|
57 |
-
|
58 |
-
|
59 |
-
def get_phonemes(word):
|
60 |
-
"""Get phonemes of a word using espeak-ng without playing audio, and remove apostrophes."""
|
61 |
-
cmd = f'espeak-ng -v fa --ipa -q "{word}" | sed "s/[ˈˌː]//g" | sed "s/q1/q/g"'
|
62 |
-
try:
|
63 |
-
# Run the subprocess with 'latin1' encoding to handle special characters
|
64 |
-
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
|
65 |
-
# Remove apostrophes from phonemes and strip any unwanted spaces or newlines
|
66 |
-
return result.stdout.strip()
|
67 |
-
except UnicodeDecodeError as e:
|
68 |
-
print(f"UnicodeDecodeError: {e}\n{word}")
|
69 |
-
return None # Or handle the error appropriately
|
70 |
-
|
71 |
-
|
72 |
-
def process_sentence(sentence, tagger, pattern, punctuation):
|
73 |
-
sentence = re.sub(pattern, r' ', sentence)
|
74 |
-
"""Convert Persian text to phonemes with Ezafe handling while keeping punctuation."""
|
75 |
-
words = word_tokenize(sentence)
|
76 |
-
tagged_words = tagger.tag(words)
|
77 |
-
|
78 |
-
phoneme_list = []
|
79 |
-
tag_index = 0 # Track the index of words that get POS tags
|
80 |
-
|
81 |
-
for word in words:
|
82 |
-
if word in punctuation:
|
83 |
-
phoneme_list.append(word)
|
84 |
-
else: # If it's a word, process normally
|
85 |
-
word = word.replace('_', ' ').replace('\u200c', ' ')
|
86 |
-
phonemes = get_phonemes(word)
|
87 |
-
kaamel_phonemes = ambiguity_dict.get(word)
|
88 |
-
if kaamel_phonemes:
|
89 |
-
if tagged_words[tag_index][1].replace(',EZ', '') in kaamel_phonemes['pos']:
|
90 |
-
phonemes = get_phoneme_for_pos(kaamel_phonemes, tagged_words[tag_index][1].replace(',EZ', ''))
|
91 |
-
|
92 |
-
# If word has Ezafe (EZ tag), modify phoneme
|
93 |
-
if 'EZ' in tagged_words[tag_index][1]:
|
94 |
-
if phonemes.endswith('jeː'):
|
95 |
-
pass
|
96 |
-
elif phonemes.endswith('ː'): # Ends in long vowel
|
97 |
-
phonemes += 'je'
|
98 |
-
elif phonemes.endswith('i'): # e.g زندگی
|
99 |
-
phonemes += 'je'
|
100 |
-
elif phonemes.endswith('je'): # e.g برای
|
101 |
-
pass
|
102 |
-
elif phonemes.endswith('e'): # e.g مدرسه
|
103 |
-
phonemes += 'je'
|
104 |
-
else:
|
105 |
-
phonemes += 'e'
|
106 |
-
|
107 |
-
phoneme_list.append(phonemes)
|
108 |
-
tag_index += 1 # Move to next tagged word
|
109 |
-
|
110 |
-
phoneme_text = ' '.join(phoneme_list)
|
111 |
-
phoneme_text = re.sub(r"\s+", " ", phoneme_text)
|
112 |
-
|
113 |
-
return phoneme_text
|
114 |
-
|
115 |
-
# FastAPI input model
|
116 |
-
class InputText(BaseModel):
|
117 |
-
text: str
|
118 |
-
|
119 |
-
# Route
|
120 |
-
@app.
|
121 |
-
async def
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
124 |
return {"phonemes": result}
|
|
|
1 |
+
import subprocess
|
2 |
+
import re
|
3 |
+
import string
|
4 |
+
from fastapi import FastAPI, Request
|
5 |
+
from pydantic import BaseModel
|
6 |
+
from hazm import POSTagger, word_tokenize
|
7 |
+
from parsnorm import ParsNorm
|
8 |
+
|
9 |
+
app = FastAPI()

# Setup: everything below is built once at import time.
# Diacritics are kept during normalization (remove_diacritics=False).
normalizer = ParsNorm(remove_diacritics=False)
# The model path is relative to the working directory; make sure the model
# file is present there before starting the app.
tagger = POSTagger(model='./pos_tagger.model')
# ASCII punctuation plus the Persian/Arabic marks used in running text.
punctuation = string.punctuation + "؟:؛»«،"
# Zero-width match at every boundary between a word character and a
# punctuation character (in either order); substituting a space there
# detaches punctuation from the neighbouring word.
pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"
|
15 |
+
|
16 |
+
# Homographs whose pronunciation depends on part of speech.
# 'phonemes' and 'pos' are parallel lists: phonemes[i] is the pronunciation to
# use when the POS tagger assigns pos[i] to the word.
ambiguity_dict = {
    'بعد': {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    # Tag order corrected: the numeral "six" is ʃeʃ, the noun "lung" is ʃoʃ.
    'شش': {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'سقط': {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'می': {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
    'روی': {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'رو': {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'ولو': {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
    'ده': {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'خیر': {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'اولی': {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
    'مایل': {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سنی': {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سبک': {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'کر': {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'نرم': {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
    'جدا': {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
    # NOTE(review): the tag order of the next two entries looks questionable
    # as well -- confirm with a Persian speaker before relying on them.
    'معین': {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'خلقی': {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'بردار': {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مرد': {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مقدم': {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'پست': {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'شما': {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
    'تنگ': {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    # Tag order corrected: the numeral "zero" is sefr, the noun (month name)
    # is safar.
    'صفر': {'phonemes': ['safar', 'sefr'], 'pos': ['NOUN', 'NUM'], 'diff': True},
    # Tag order corrected: the adjective "full" is por, the noun "feather"
    # is par.
    'پر': {'phonemes': ['por', 'par'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'مصر': {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کشت': {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'کی': {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
    'جور': {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کرد': {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'علی': {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'شست': {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'دهم': {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
}
|
52 |
+
def get_phoneme_for_pos(entry, target_pos):
    """Return the pronunciation that *entry* lists for *target_pos*.

    Args:
        entry: Mapping with parallel sequences under 'phonemes' and 'pos';
            the i-th phoneme string belongs to the i-th POS tag.
        target_pos: POS tag to look up, compared by equality.

    Returns:
        The phoneme string paired with the first matching tag, or None when
        no tag matches.
    """
    # zip only pairs positions present in both lists, so an entry whose lists
    # differ in length yields None instead of raising IndexError.
    for pos_tag, phonemes in zip(entry['pos'], entry['phonemes']):
        if pos_tag == target_pos:
            return phonemes
    return None
|
57 |
+
|
58 |
+
|
59 |
+
def get_phonemes(word):
    """Return the espeak-ng IPA transcription of *word* using the 'fa' voice.

    espeak-ng is run quietly (no audio). The primary/secondary stress marks
    and the length mark (ˈ ˌ ː) are removed from its output and every 'q1'
    is rewritten to 'q'.

    Args:
        word: Text to transcribe; it may contain spaces.

    Returns:
        The cleaned, whitespace-stripped IPA string. An empty string is
        returned when espeak-ng prints nothing or cannot be started, and
        None when its output cannot be decoded as UTF-8.
    """
    # Pass an argument list and no shell: *word* originates from API requests,
    # so it must never be spliced into a shell command line. '--' stops a
    # word that starts with '-' from being parsed as an option.
    cmd = ['espeak-ng', '-v', 'fa', '--ipa', '-q', '--', word]
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale the server happens to run under.
        result = subprocess.run(
            cmd,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            encoding='utf-8',
        )
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}\n{word}")
        return None
    except OSError as e:
        # espeak-ng missing or not executable: behave like "no output".
        print(f"espeak-ng could not be run: {e}\n{word}")
        return ''

    # Same clean-up the former sed pipeline did, in the same order.
    without_marks = result.stdout.translate({ord(mark): None for mark in 'ˈˌː'})
    return without_marks.replace('q1', 'q').strip()
|
70 |
+
|
71 |
+
|
72 |
+
def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

    Args:
        sentence: Text to convert.
        tagger: POS tagger whose ``tag(tokens)`` returns one ``(token, tag)``
            pair per token; an Ezafe-marked token carries 'EZ' in its tag.
        pattern: Regex matching the zero-width boundaries between word and
            punctuation characters.
        punctuation: String of characters treated as punctuation.

    Returns:
        The phoneme strings and punctuation tokens joined by single spaces.
    """
    # Detach punctuation from neighbouring words so it is tokenized separately.
    sentence = re.sub(pattern, r' ', sentence)
    words = word_tokenize(sentence)
    tagged_words = tagger.tag(words)

    phoneme_list = []
    # The tagger tags every token, punctuation included, so tokens and tags
    # are walked in lockstep; a counter that skips punctuation would drift.
    for word, (_, tag) in zip(words, tagged_words):
        # A token is punctuation only if every character is; a substring
        # test would misclassify multi-character tokens.
        if word and all(ch in punctuation for ch in word):
            phoneme_list.append(word)
            continue

        word = word.replace('_', ' ').replace('\u200c', ' ')
        base_tag = tag.replace(',EZ', '')

        # Prefer the POS-specific pronunciation of a known homograph and only
        # fall back to espeak-ng when the dictionary cannot decide.
        phonemes = None
        entry = ambiguity_dict.get(word)
        if entry and base_tag in entry['pos']:
            phonemes = get_phoneme_for_pos(entry, base_tag)
        if phonemes is None:
            # get_phonemes may return None on a decoding error.
            phonemes = get_phonemes(word) or ''

        # If the word has Ezafe (EZ tag), append the linking vowel.
        if 'EZ' in tag and phonemes:
            if phonemes.endswith(('je', 'jeː')):  # e.g. برای: already present
                pass
            elif phonemes.endswith(('ː', 'ɑ', 'u', 'i', 'e')):
                # Vowel-final word (e.g. زندگی, مدرسه). get_phonemes strips
                # the length mark, so long vowels are matched by their bare
                # symbols ɑ and u as well.
                phonemes += 'je'
            else:
                phonemes += 'e'

        phoneme_list.append(phonemes)

    phoneme_text = ' '.join(phoneme_list)
    # Collapse runs of whitespace (multi-word tokens, empty transcriptions).
    phoneme_text = re.sub(r"\s+", " ", phoneme_text)

    return phoneme_text
|
114 |
+
|
115 |
+
# FastAPI input model
|
116 |
+
class InputText(BaseModel):
    """Request body of the /phonemize endpoint."""

    # Text to convert to phonemes.
    text: str
|
118 |
+
|
119 |
+
# Route
|
120 |
+
@app.get("/")
async def root():
    """Return a short welcome message pointing clients at /phonemize."""
    return {"message": "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."}
|
123 |
+
|
124 |
+
@app.post("/phonemize")
async def phonemize(input_data: InputText):
    """Normalize the submitted text and return its phoneme transcription.

    Args:
        input_data: Request body carrying the text to convert.

    Returns:
        A dict with the transcription under the 'phonemes' key.
    """
    # Punctuation is kept during normalization so that it survives into the
    # phoneme output.
    normalized = normalizer.normalize(input_data.text, remove_punct=False)
    # NOTE(review): process_sentence is synchronous and runs an external
    # process for words; presumably this blocks the event loop while it
    # runs -- confirm whether a plain `def` route would be preferable.
    result = process_sentence(normalized, tagger, pattern, punctuation)
    return {"phonemes": result}
|