saeedzou committed on
Commit
33faa47
·
verified ·
1 Parent(s): 919f19c

Update phonemizer.py

Browse files
Files changed (1) hide show
  1. phonemizer.py +127 -123
phonemizer.py CHANGED
@@ -1,124 +1,128 @@
1
- import subprocess
2
- import re
3
- import string
4
- from fastapi import FastAPI, Request
5
- from pydantic import BaseModel
6
- from hazm import POSTagger, word_tokenize
7
- from parsnorm import ParsNorm
8
-
9
- app = FastAPI()
10
- # Setup
11
- normalizer = ParsNorm(remove_diacritics=False)
12
- tagger = POSTagger(model='./pos_tagger.model') # Make sure this model is present
13
- punctuation = string.punctuation + "؟:؛»«،"
14
- pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"
15
-
16
- ambiguity_dict = {
17
- 'بعد' : {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
18
- 'شش' : {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NOUN', 'NUM'], 'diff': True},
19
- 'سقط' : {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
20
- 'می' : {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
21
- 'روی' : {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
22
- 'رو' : {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
23
- 'ولو' : {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
24
- 'ده' : {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
25
- 'خیر' : {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
26
- 'اولی' : {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
27
- 'مایل' : {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
28
- 'سنی' : {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
29
- 'سبک' : {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
30
- 'کر' : {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
31
- 'نرم' : {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
32
- 'جدا' : {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
33
- 'معین' : {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
34
- 'خلقی' : {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
35
- 'بردار' : {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
36
- 'مرد' : {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
37
- 'مقدم' : {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
38
- 'پست' : {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
39
- 'شما' : {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
40
- 'تنگ' : {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
41
- 'صفر' : {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
42
- 'پر' : {'phonemes': ['por', 'par'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
43
- 'مصر' : {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
44
- 'کشت' : {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
45
- 'کی' : {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
46
- 'جور' : {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
47
- 'کرد' : {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
48
- 'علی' : {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
49
- 'شست' : {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
50
- 'دهم' : {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
51
- }
52
- def get_phoneme_for_pos(entry, target_pos):
53
- for i, pos_tag in enumerate(entry['pos']):
54
- if pos_tag == target_pos:
55
- return entry['phonemes'][i]
56
- return None # Return None if target POS tag is not found
57
-
58
-
59
- def get_phonemes(word):
60
- """Get phonemes of a word using espeak-ng without playing audio, and remove apostrophes."""
61
- cmd = f'espeak-ng -v fa --ipa -q "{word}" | sed "s/[ˈˌː]//g" | sed "s/q1/q/g"'
62
- try:
63
- # Run the subprocess with 'latin1' encoding to handle special characters
64
- result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
65
- # Remove apostrophes from phonemes and strip any unwanted spaces or newlines
66
- return result.stdout.strip()
67
- except UnicodeDecodeError as e:
68
- print(f"UnicodeDecodeError: {e}\n{word}")
69
- return None # Or handle the error appropriately
70
-
71
-
72
- def process_sentence(sentence, tagger, pattern, punctuation):
73
- sentence = re.sub(pattern, r' ', sentence)
74
- """Convert Persian text to phonemes with Ezafe handling while keeping punctuation."""
75
- words = word_tokenize(sentence)
76
- tagged_words = tagger.tag(words)
77
-
78
- phoneme_list = []
79
- tag_index = 0 # Track the index of words that get POS tags
80
-
81
- for word in words:
82
- if word in punctuation:
83
- phoneme_list.append(word)
84
- else: # If it's a word, process normally
85
- word = word.replace('_', ' ').replace('\u200c', ' ')
86
- phonemes = get_phonemes(word)
87
- kaamel_phonemes = ambiguity_dict.get(word)
88
- if kaamel_phonemes:
89
- if tagged_words[tag_index][1].replace(',EZ', '') in kaamel_phonemes['pos']:
90
- phonemes = get_phoneme_for_pos(kaamel_phonemes, tagged_words[tag_index][1].replace(',EZ', ''))
91
-
92
- # If word has Ezafe (EZ tag), modify phoneme
93
- if 'EZ' in tagged_words[tag_index][1]:
94
- if phonemes.endswith('jeː'):
95
- pass
96
- elif phonemes.endswith('ː'): # Ends in long vowel
97
- phonemes += 'je'
98
- elif phonemes.endswith('i'): # e.g زندگی
99
- phonemes += 'je'
100
- elif phonemes.endswith('je'): # e.g برای
101
- pass
102
- elif phonemes.endswith('e'): # e.g مدرسه
103
- phonemes += 'je'
104
- else:
105
- phonemes += 'e'
106
-
107
- phoneme_list.append(phonemes)
108
- tag_index += 1 # Move to next tagged word
109
-
110
- phoneme_text = ' '.join(phoneme_list)
111
- phoneme_text = re.sub(r"\s+", " ", phoneme_text)
112
-
113
- return phoneme_text
114
-
115
- # FastAPI input model
116
- class InputText(BaseModel):
117
- text: str
118
-
119
- # Route
120
- @app.post("/phonemize")
121
- async def phonemize(input_data: InputText):
122
- normalized = normalizer.normalize(input_data.text, remove_punct=False)
123
- result = process_sentence(normalized, tagger, pattern, punctuation)
 
 
 
 
124
  return {"phonemes": result}
 
1
+ import subprocess
2
+ import re
3
+ import string
4
+ from fastapi import FastAPI, Request
5
+ from pydantic import BaseModel
6
+ from hazm import POSTagger, word_tokenize
7
+ from parsnorm import ParsNorm
8
+
9
# ASGI application object; the route decorators further down register on it.
app = FastAPI()
# Setup
# Text normaliser applied to request text before phonemization; constructed
# with remove_diacritics=False.
normalizer = ParsNorm(remove_diacritics=False)
# POS tagger loaded from a model file given as a relative path.
tagger = POSTagger(model='./pos_tagger.model') # Make sure this model is present
# Characters treated as punctuation: ASCII punctuation plus Persian marks.
punctuation = string.punctuation + "؟:؛»«،"
# Zero-width regex matching every boundary between a word character and a
# punctuation character (in either order). process_sentence substitutes a
# space at each match so punctuation is separated from adjacent words.
pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"
15
+
16
# Pronunciation overrides for written forms whose reading depends on their
# part of speech. Each value holds two parallel lists: 'phonemes'[i] is the
# reading selected when the POS tag equals 'pos'[i] (see get_phoneme_for_pos).
# NOTE(review): the 'diff' flag is not read by any code visible in this file.
ambiguity_dict = {
    'بعد' : {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'شش' : {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NOUN', 'NUM'], 'diff': True},
    'سقط' : {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'می' : {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
    'روی' : {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'رو' : {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'ولو' : {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
    'ده' : {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'خیر' : {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'اولی' : {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
    'مایل' : {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سنی' : {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سبک' : {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'کر' : {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'نرم' : {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
    'جدا' : {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
    'معین' : {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'خلقی' : {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'بردار' : {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مرد' : {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مقدم' : {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'پست' : {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'شما' : {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
    'تنگ' : {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'صفر' : {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'پر' : {'phonemes': ['por', 'par'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'مصر' : {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کشت' : {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'کی' : {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
    'جور' : {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کرد' : {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'علی' : {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'شست' : {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'دهم' : {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
}
52
def get_phoneme_for_pos(entry, target_pos):
    """Return the pronunciation that *entry* registers for *target_pos*.

    Args:
        entry: Mapping with parallel ``'pos'`` and ``'phonemes'`` sequences,
            where ``entry['phonemes'][i]`` belongs to ``entry['pos'][i]``.
        target_pos: POS tag to look up; compared to each tag with ``==``.

    Returns:
        The phoneme string paired with the first matching tag, or ``None``
        when no tag in the entry matches.
    """
    # zip pairs the two lists positionally and stops at the shorter one, so
    # an entry listing more tags than pronunciations yields None for the
    # unpaired tags instead of raising IndexError.
    return next(
        (
            phoneme
            for pos_tag, phoneme in zip(entry['pos'], entry['phonemes'])
            if pos_tag == target_pos
        ),
        None,
    )
57
+
58
+
59
def get_phonemes(word):
    """Return the IPA transcription of *word* produced by ``espeak-ng -v fa``.

    espeak-ng is run with ``--ipa`` and ``-q`` (no audio). The stress marks
    ``ˈ`` / ``ˌ`` and the length mark ``ː`` are removed from its output,
    every ``q1`` is rewritten to ``q``, and surrounding whitespace is
    stripped.

    Args:
        word: Text to transcribe. It is handed to espeak-ng as a single
            argument-vector entry and never passes through a shell.

    Returns:
        The cleaned phoneme string. ``""`` when espeak-ng cannot be launched
        (the previous shell pipeline also produced empty output in that
        case). ``None`` when the output cannot be decoded as UTF-8.
    """
    # An argument list without shell=True keeps quotes, `$(...)`, backticks
    # and `;` inside request text from being interpreted as shell syntax.
    cmd = ["espeak-ng", "-v", "fa", "--ipa", "-q", word]
    try:
        # IPA output is non-ASCII, so decode explicitly as UTF-8 rather than
        # relying on the process locale.
        result = subprocess.run(
            cmd, capture_output=True, text=True, encoding="utf-8"
        )
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}\n{word}")
        return None
    except OSError as e:
        print(f"Could not run espeak-ng: {e}")
        return ""

    # Same clean-up the former `sed` stages performed, done in-process.
    without_marks = result.stdout.translate(str.maketrans("", "", "ˈˌː"))
    return without_marks.replace("q1", "q").strip()
70
+
71
+
72
def _apply_ezafe(phonemes):
    """Return *phonemes* with the Ezafe linking vowel attached."""
    if phonemes.endswith(('jeː', 'je')):  # e.g برای: already ends in the link
        return phonemes
    if phonemes.endswith(('ː', 'i', 'e')):  # long vowel, e.g زندگی, e.g مدرسه
        return phonemes + 'je'
    return phonemes + 'e'


def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

    Args:
        sentence: Text to convert.
        tagger: Object whose ``tag(tokens)`` returns one ``(token, tag)``
            pair per token; a tag may carry an ``,EZ`` suffix marking Ezafe.
        pattern: Regex whose matches are replaced by a space before
            tokenising (used to detach punctuation from words).
        punctuation: String of characters treated as punctuation.

    Returns:
        The phonemes and punctuation tokens joined by single spaces.
    """
    sentence = re.sub(pattern, r' ', sentence)
    words = word_tokenize(sentence)
    tagged_words = tagger.tag(words)

    phoneme_list = []
    # tagger.tag() receives every token, punctuation included, so its result
    # is assumed to line up one-to-one with `words`. Pairing with zip keeps
    # each word with its own tag; a separately maintained counter can drift
    # once a punctuation token has been seen.
    for word, tagged in zip(words, tagged_words):
        tag = tagged[1]

        # Character-wise test so multi-character runs such as "..." or "!!"
        # are kept as punctuation instead of being sent to the synthesiser.
        if all(ch in punctuation for ch in word):
            phoneme_list.append(word)
            continue

        # Underscores and zero-width non-joiners become plain spaces.
        word = word.replace('_', ' ').replace('\u200c', ' ')
        phonemes = get_phonemes(word)

        # Prefer the POS-specific reading when the word is a known homograph.
        entry = ambiguity_dict.get(word)
        if entry:
            override = get_phoneme_for_pos(entry, tag.replace(',EZ', ''))
            if override is not None:
                phonemes = override

        # get_phonemes may return None on a decode error; drop the word
        # rather than crash on it below.
        if phonemes is None:
            continue

        # If word has Ezafe (EZ tag), modify phoneme. Empty output is left
        # empty so a failed transcription does not turn into a stray vowel.
        if phonemes and 'EZ' in tag:
            phonemes = _apply_ezafe(phonemes)

        phoneme_list.append(phonemes)

    phoneme_text = ' '.join(phoneme_list)
    return re.sub(r"\s+", " ", phoneme_text)
114
+
115
+ # FastAPI input model
116
# Request body accepted by the /phonemize route.
class InputText(BaseModel):
    # Text to be normalised and converted to phonemes.
    text: str
118
+
119
+ # Route
120
@app.get("/")
async def root():
    # Landing route: tells callers which endpoint performs the actual work.
    welcome = "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."
    return {"message": welcome}
123
+
124
@app.post("/phonemize")
async def phonemize(input_data: InputText):
    # Normalise the submitted text (punctuation kept) and phonemize it with
    # the module-level tagger and punctuation settings.
    # NOTE(review): process_sentence is synchronous and get_phonemes starts a
    # subprocess, so this coroutine holds the event loop while it runs.
    # Confirm whether that is acceptable for the expected load.
    return {
        "phonemes": process_sentence(
            normalizer.normalize(input_data.text, remove_punct=False),
            tagger,
            pattern,
            punctuation,
        )
    }