saeedzou committed on
Commit
32064f6
·
verified ·
1 Parent(s): 7be148a

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +29 -0
  2. phonemizer.py +124 -0
  3. pos_tagger.model +3 -0
Dockerfile ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as the base image
FROM python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Install system dependencies:
#   espeak-ng - invoked by phonemizer.py to produce IPA output
#   git       - needed to clone ParsNorm below
RUN apt-get update && apt-get install -y \
    espeak-ng \
    git \
    && rm -rf /var/lib/apt/lists/*

# Clone ParsNorm repository and install it, then the web-server packages
RUN git clone https://github.com/saeedzou/ParsNorm.git \
    && cd ParsNorm \
    && pip install -e . \
    && pip install -r requirements.txt \
    && pip install fastapi uvicorn

# POS tagger model; phonemizer.py opens it as ./pos_tagger.model, i.e.
# relative to WORKDIR
COPY pos_tagger.model .

# Copy the FastAPI application module into the container
COPY phonemizer.py .

# Expose the port FastAPI will run on
EXPOSE 7860

# Run the FastAPI app. The application object lives in phonemizer.py, so the
# ASGI import string must be "phonemizer:app"; "app:app" fails at start-up
# because no app.py is copied into the image.
CMD ["uvicorn", "phonemizer:app", "--host", "0.0.0.0", "--port", "7860"]
phonemizer.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import re
3
+ import string
4
+ from fastapi import FastAPI, Request
5
+ from pydantic import BaseModel
6
+ from hazm import POSTagger, word_tokenize
7
+ from parsnorm import ParsNorm
8
+
9
# ASGI application object served by uvicorn.
app = FastAPI()

# Shared resources, built once at import time and reused by every request.

# ParsNorm normalizer configured to keep diacritics.
normalizer = ParsNorm(remove_diacritics=False)

# hazm POS tagger; the model path is relative to the working directory,
# so pos_tagger.model must be present there.
tagger = POSTagger(model='./pos_tagger.model')

# ASCII punctuation plus the Persian/Arabic marks handled by this service.
punctuation = string.punctuation + "؟:؛»«،"

# Zero-width regex matching every position where a word character touches a
# punctuation character (in either order). Substituting a space at those
# positions detaches punctuation from the neighbouring word.
pattern = (
    rf"(?<=\w)(?=[{re.escape(punctuation)}])"
    rf"|(?<=[{re.escape(punctuation)}])(?=\w)"
)
15
+
16
# Homographs whose pronunciation depends on part of speech.
#
# Each entry holds two parallel lists: 'phonemes'[i] is the pronunciation to
# use when the POS tagger assigns 'pos'[i] to the word. 'diff' is carried along
# unchanged; nothing in this file reads it.
#
# Three entries had their POS tags paired with the wrong pronunciation and are
# corrected here (POS order swapped, phoneme order untouched):
#   شش  - NUM "six" is ʃeʃ, NOUN "lung" is ʃoʃ
#   صفر - NUM "zero" is sefr, NOUN (month name) is safar
#   پر  - ADJ "full" is por, NOUN "feather" is par
ambiguity_dict = {
    'بعد': {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'شش': {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'سقط': {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'می': {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
    'روی': {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'رو': {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'ولو': {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
    'ده': {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'خیر': {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'اولی': {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
    'مایل': {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سنی': {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سبک': {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'کر': {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'نرم': {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
    'جدا': {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
    'معین': {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'خلقی': {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'بردار': {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مرد': {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مقدم': {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'پست': {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'شما': {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
    'تنگ': {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'صفر': {'phonemes': ['safar', 'sefr'], 'pos': ['NOUN', 'NUM'], 'diff': True},
    'پر': {'phonemes': ['por', 'par'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'مصر': {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کشت': {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'کی': {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
    'جور': {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کرد': {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'علی': {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'شست': {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'دهم': {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
}
52
def get_phoneme_for_pos(entry, target_pos):
    """Return the pronunciation that *entry* associates with *target_pos*.

    Args:
        entry: Mapping with two parallel sequences, ``entry['pos']`` and
            ``entry['phonemes']``; the phoneme at index ``i`` belongs to the
            POS tag at index ``i``.
        target_pos: POS tag to look up.

    Returns:
        The phoneme string paired with the first tag equal to *target_pos*,
        or ``None`` when no tag matches.
    """
    # Walk both lists in lockstep instead of indexing one by the other.
    for pos_tag, phoneme in zip(entry['pos'], entry['phonemes']):
        if pos_tag == target_pos:
            return phoneme
    return None  # target POS tag is not listed for this word
57
+
58
+
59
# Primary stress, secondary stress and length marks, removed from the
# espeak-ng IPA output (this replaces the former `sed "s/[ˈˌː]//g"` step).
_IPA_MARKS_TO_DROP = str.maketrans('', '', 'ˈˌː')


def get_phonemes(word):
    """Return the IPA transcription of *word* produced by espeak-ng.

    espeak-ng is run with the Persian voice in quiet mode (no audio). Stress
    and length marks are removed from its output, the sequence ``q1`` is
    rewritten to ``q``, and surrounding whitespace is stripped.

    Args:
        word: Text to transcribe. It is passed to espeak-ng as a single
            argument and is never interpreted by a shell.

    Returns:
        The cleaned phoneme string, or ``None`` if the espeak-ng output
        could not be decoded as UTF-8.
    """
    # An argument list (no shell) keeps quotes, `$`, backticks etc. in the
    # input from being executed. The "--" ends option parsing so that text
    # starting with "-" is treated as text, not as an espeak-ng option.
    cmd = ['espeak-ng', '-v', 'fa', '--ipa', '-q', '--', word]
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # container's locale settings.
        result = subprocess.run(
            cmd, capture_output=True, text=True, encoding='utf-8'
        )
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}\n{word}")
        return None

    cleaned = result.stdout.translate(_IPA_MARKS_TO_DROP).replace('q1', 'q')
    return cleaned.strip()
70
+
71
+
72
def _apply_ezafe(phonemes):
    """Return *phonemes* with the Ezafe linking vowel attached to its ending."""
    # Already ends in the glide + vowel (e.g. برای): nothing to add.
    if phonemes.endswith(('jeː', 'je')):
        return phonemes
    # After a long vowel, 'i' (e.g. زندگی) or 'e' (e.g. مدرسه) a glide is needed.
    if phonemes.endswith(('ː', 'i', 'e')):
        return phonemes + 'je'
    return phonemes + 'e'


def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

    Args:
        sentence: Text to convert.
        tagger: Object whose ``tag(tokens)`` returns a sequence of
            ``(token, tag)`` pairs. It is assumed to return exactly one pair
            per input token, in order; verify against the hazm version in use.
        pattern: Regex (string or compiled) matching the zero-width
            boundaries between word and punctuation characters; a space is
            inserted at every match.
        punctuation: Collection of punctuation characters. Tokens made up
            only of these characters are copied to the output unchanged.

    Returns:
        The phoneme strings and punctuation tokens joined by single spaces.
    """
    sentence = re.sub(pattern, ' ', sentence)
    words = word_tokenize(sentence)
    if not words:
        return ''
    tagged_words = tagger.tag(words)
    punct_chars = set(punctuation)

    phoneme_list = []
    # Pair every token with its own tag. A separate counter that advances
    # only on non-punctuation tokens drifts out of sync, because the tagger
    # is given (and tags) the punctuation tokens too.
    for word, tagged in zip(words, tagged_words):
        tag = tagged[1]

        # Character-wise test: a plain `word in punctuation` is a substring
        # check and misses runs such as "!!" or "...".
        if all(ch in punct_chars for ch in word):
            phoneme_list.append(word)
            continue

        word = word.replace('_', ' ').replace('\u200c', ' ')

        # Prefer the POS-specific pronunciation for known homographs and
        # fall back to espeak-ng for everything else.
        phonemes = None
        entry = ambiguity_dict.get(word)
        if entry:
            phonemes = get_phoneme_for_pos(entry, tag.replace(',EZ', ''))
        if phonemes is None:
            phonemes = get_phonemes(word)
        if phonemes is None:
            # espeak-ng output could not be decoded; skip this token rather
            # than crash on `.endswith` / `join` below.
            continue

        # If word has Ezafe (EZ tag), modify phoneme
        if 'EZ' in tag:
            phonemes = _apply_ezafe(phonemes)

        phoneme_list.append(phonemes)

    phoneme_text = ' '.join(phoneme_list)
    return re.sub(r"\s+", " ", phoneme_text)
114
+
115
# Request body schema for the POST /phonemize endpoint.
# (Documented with comments rather than a docstring so the generated API
# schema stays unchanged.)
class InputText(BaseModel):
    # Text to normalize and convert to phonemes.
    text: str
118
+
119
# POST /phonemize
# Normalizes the submitted text (punctuation is kept), converts it to
# phonemes and returns the result under the "phonemes" key.
@app.post("/phonemize")
async def phonemize(input_data: InputText):
    # NOTE(review): normalization, tagging and the espeak-ng subprocess calls
    # all run synchronously inside this coroutine - confirm that blocking the
    # event loop for the duration of a request is acceptable.
    cleaned_text = normalizer.normalize(input_data.text, remove_punct=False)
    return {
        "phonemes": process_sentence(cleaned_text, tagger, pattern, punctuation)
    }
pos_tagger.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b89995f2fdd39e6efa897e824ec38824e399366821d1afcb01e81a9160dd9a0d
3
+ size 19246648