Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- Dockerfile +29 -0
- phonemizer.py +124 -0
- pos_tagger.model +3 -0
Dockerfile
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use an official Python runtime as the base image
FROM python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Install system dependencies:
#   espeak-ng - invoked by phonemizer.py to produce IPA output
#   git       - needed to clone ParsNorm below
RUN apt-get update && apt-get install -y \
    espeak-ng \
    git \
    && rm -rf /var/lib/apt/lists/*

# Clone ParsNorm repository and install it, then add the web-server packages.
# NOTE(review): phonemizer.py also imports `hazm`; presumably it is pulled in
# by ParsNorm's requirements.txt - confirm, or install it explicitly here.
RUN git clone https://github.com/saeedzou/ParsNorm.git \
    && cd ParsNorm \
    && pip install -e . \
    && pip install -r requirements.txt \
    && pip install fastapi uvicorn

# POS tagger model loaded by phonemizer.py from ./pos_tagger.model
COPY pos_tagger.model .

# Copy your Python script into the container
COPY phonemizer.py .

# Expose the port FastAPI will run on
EXPOSE 7860

# Run the FastAPI app.
# The application object `app` lives in phonemizer.py, so the import string
# must be "phonemizer:app" ("app:app" would look for a non-existent app.py).
CMD ["uvicorn", "phonemizer:app", "--host", "0.0.0.0", "--port", "7860"]
|
phonemizer.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess
|
2 |
+
import re
|
3 |
+
import string
|
4 |
+
from fastapi import FastAPI, Request
|
5 |
+
from pydantic import BaseModel
|
6 |
+
from hazm import POSTagger, word_tokenize
|
7 |
+
from parsnorm import ParsNorm
|
8 |
+
|
9 |
+
app = FastAPI()

# Setup: these objects are created once at import time and shared by requests.
normalizer = ParsNorm(remove_diacritics=False)
tagger = POSTagger(model='./pos_tagger.model')  # Make sure this model is present

# ASCII punctuation plus the Persian/Arabic marks handled by this service.
punctuation = string.punctuation + "؟:؛»«،"

# Zero-width match at every boundary between a word character and a
# punctuation character (in either order). process_sentence() substitutes a
# space at these positions so punctuation becomes a separate token.
pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"
|
15 |
+
|
16 |
+
# Homographs whose pronunciation depends on part of speech.
#
# Each key is a written word; its value holds two parallel lists:
#   'phonemes'[i] is the pronunciation to use when the POS tag is 'pos'[i].
# 'diff' is True for every entry and is not read by the code in this file.
#
# NOTE(review): for 'شش', 'پر' and 'صفر' the phoneme/POS pairing looks
# reversed (e.g. 'ʃeʃ' is paired with NOUN and 'ʃoʃ' with NUM) - confirm
# against the tagger's actual output before changing the data.
ambiguity_dict = {
    'بعد': {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'شش': {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NOUN', 'NUM'], 'diff': True},
    'سقط': {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'می': {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
    'روی': {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'رو': {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'ولو': {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
    'ده': {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'خیر': {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'اولی': {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
    'مایل': {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سنی': {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سبک': {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'کر': {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'نرم': {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
    'جدا': {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
    'معین': {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'خلقی': {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'بردار': {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مرد': {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مقدم': {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'پست': {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'شما': {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
    'تنگ': {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'صفر': {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'پر': {'phonemes': ['por', 'par'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'مصر': {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کشت': {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'کی': {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
    'جور': {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کرد': {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'علی': {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'شست': {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'دهم': {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
}
|
52 |
+
def get_phoneme_for_pos(entry, target_pos):
    """Return the pronunciation in *entry* that is paired with *target_pos*.

    Args:
        entry: A mapping with parallel ``'pos'`` and ``'phonemes'`` lists,
            where ``entry['phonemes'][i]`` belongs to ``entry['pos'][i]``.
        target_pos: The POS tag to look up.

    Returns:
        The phoneme string of the first matching POS tag, or ``None`` if
        *target_pos* does not occur in ``entry['pos']``.
    """
    for pos_tag, phoneme in zip(entry['pos'], entry['phonemes']):
        if pos_tag == target_pos:
            return phoneme
    return None  # Return None if target POS tag is not found
|
57 |
+
|
58 |
+
|
59 |
+
# Stress marks (ˈ, ˌ) and the length mark (ː) are removed from espeak-ng output.
_IPA_MARKS_TO_DROP = str.maketrans('', '', 'ˈˌː')


def get_phonemes(word):
    """Get the IPA phonemes of *word* from espeak-ng without playing audio.

    Runs ``espeak-ng -v fa --ipa -q`` on *word*, then removes the stress and
    length marks (``ˈ``, ``ˌ``, ``ː``) and rewrites ``q1`` as ``q``.

    Args:
        word: The text to transcribe.

    Returns:
        The cleaned, whitespace-stripped IPA string, or ``None`` if espeak-ng
        could not be started or its output could not be decoded.
    """
    # Pass the text as a single argv entry and never through a shell: *word*
    # comes from request text, so interpolating it into a shell command line
    # would allow command injection. "--" keeps text that starts with "-"
    # from being parsed as an option.
    cmd = ['espeak-ng', '-v', 'fa', '--ipa', '-q', '--', word]
    try:
        # IPA output is decoded explicitly as UTF-8 rather than relying on the
        # process locale.
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')
    except (OSError, ValueError) as e:
        # OSError: espeak-ng missing / not executable.
        # ValueError: undecodable output (UnicodeDecodeError) or an embedded
        # NUL byte in *word*.
        print(f"{type(e).__name__}: {e}\n{word}")
        return None

    # Same clean-up the former `sed` pipeline performed, in the same order.
    return result.stdout.translate(_IPA_MARKS_TO_DROP).replace('q1', 'q').strip()
|
70 |
+
|
71 |
+
|
72 |
+
def _add_ezafe(phonemes):
    """Append the Ezafe ending to a non-empty phoneme string.

    Strings already ending in ``je`` are left alone; strings ending in a vowel
    handled here get ``je``; everything else gets ``e``.
    """
    if phonemes.endswith(('je', 'jeː')):  # e.g برای
        return phonemes
    # 'ː' marks a long vowel, but get_phonemes() strips that mark, so the bare
    # long vowels 'ɑ' and 'u' are checked as well (otherwise this rule could
    # never fire). 'i' covers e.g زندگی and 'e' covers e.g مدرسه.
    if phonemes.endswith(('ː', 'ɑ', 'u', 'i', 'e')):
        return phonemes + 'je'
    return phonemes + 'e'


def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

    Args:
        sentence: The (already normalized) input text.
        tagger: POS tagger; ``tagger.tag(tokens)`` must return one
            ``(token, tag)`` pair per token, in order.
        pattern: Regex matching the word/punctuation boundaries at which a
            space is inserted before tokenizing.
        punctuation: String of all characters treated as punctuation.

    Returns:
        The phonemes and punctuation tokens joined by single spaces.
    """
    # Separate punctuation from adjacent words so it is tokenized on its own.
    sentence = re.sub(pattern, r' ', sentence)
    words = word_tokenize(sentence)
    tagged_words = tagger.tag(words)

    phoneme_list = []
    # The tagger tags every token (punctuation included), so tokens and tags
    # are walked in lockstep; a counter that skips punctuation would drift out
    # of alignment after the first punctuation mark.
    for word, tagged in zip(words, tagged_words):
        tag = tagged[1]

        # Punctuation tokens (including runs such as "...") pass through as-is.
        if all(ch in punctuation for ch in word):
            phoneme_list.append(word)
            continue

        # If it's a word, process normally
        word = word.replace('_', ' ').replace('\u200c', ' ')
        phonemes = get_phonemes(word)

        # For known homographs, the POS tag (without the Ezafe marker) picks
        # the pronunciation.
        base_tag = tag.replace(',EZ', '')
        homograph = ambiguity_dict.get(word)
        if homograph and base_tag in homograph['pos']:
            phonemes = get_phoneme_for_pos(homograph, base_tag)

        # get_phonemes() returns None on failure; keep going with no phonemes
        # for this token instead of crashing the whole sentence.
        if phonemes is None:
            phonemes = ''

        # If word has Ezafe (EZ tag), modify phoneme
        if 'EZ' in tag and phonemes:
            phonemes = _add_ezafe(phonemes)

        phoneme_list.append(phonemes)

    phoneme_text = ' '.join(phoneme_list)
    phoneme_text = re.sub(r"\s+", " ", phoneme_text)

    return phoneme_text
|
114 |
+
|
115 |
+
# FastAPI input model
|
116 |
+
class InputText(BaseModel):
    """Request body accepted by the ``/phonemize`` route."""

    # Raw text to normalize and convert to phonemes.
    text: str
|
118 |
+
|
119 |
+
# Route
|
120 |
+
@app.post("/phonemize")
async def phonemize(input_data: InputText):
    """Normalize the submitted text and return its phoneme transcription.

    Args:
        input_data: Request body carrying the text to phonemize.

    Returns:
        A dict ``{"phonemes": <str>}`` with the result of process_sentence().
    """
    # Punctuation is kept during normalization; process_sentence() passes it
    # through to the output.
    normalized = normalizer.normalize(input_data.text, remove_punct=False)
    result = process_sentence(normalized, tagger, pattern, punctuation)
    return {"phonemes": result}
|
pos_tagger.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b89995f2fdd39e6efa897e824ec38824e399366821d1afcb01e81a9160dd9a0d
|
3 |
+
size 19246648
|