Maximofn commited on
Commit
39f7b40
1 Parent(s): d51a666

Good transcription and translation, but text2speech doesn't work well

Browse files
Files changed (3) hide show
  1. lang_list.py +172 -357
  2. requirements.txt +4 -1
  3. translatube.py +108 -60
lang_list.py CHANGED
@@ -1,360 +1,175 @@
1
- # Language dict
2
- language_code_to_name = {
3
- "afr": "Afrikaans",
4
- "amh": "Amharic",
5
- "arb": "Modern Standard Arabic",
6
- "ary": "Moroccan Arabic",
7
- "arz": "Egyptian Arabic",
8
- "asm": "Assamese",
9
- "ast": "Asturian",
10
- "azj": "North Azerbaijani",
11
- "bel": "Belarusian",
12
- "ben": "Bengali",
13
- "bos": "Bosnian",
14
- "bul": "Bulgarian",
15
- "cat": "Catalan",
16
- "ceb": "Cebuano",
17
- "ces": "Czech",
18
- "ckb": "Central Kurdish",
19
- "cmn": "Mandarin Chinese",
20
- "cym": "Welsh",
21
- "dan": "Danish",
22
- "deu": "German",
23
- "ell": "Greek",
24
- "eng": "English",
25
- "est": "Estonian",
26
- "eus": "Basque",
27
- "fin": "Finnish",
28
- "fra": "French",
29
- "gaz": "West Central Oromo",
30
- "gle": "Irish",
31
- "glg": "Galician",
32
- "guj": "Gujarati",
33
- "heb": "Hebrew",
34
- "hin": "Hindi",
35
- "hrv": "Croatian",
36
- "hun": "Hungarian",
37
- "hye": "Armenian",
38
- "ibo": "Igbo",
39
- "ind": "Indonesian",
40
- "isl": "Icelandic",
41
- "ita": "Italian",
42
- "jav": "Javanese",
43
- "jpn": "Japanese",
44
- "kam": "Kamba",
45
- "kan": "Kannada",
46
- "kat": "Georgian",
47
- "kaz": "Kazakh",
48
- "kea": "Kabuverdianu",
49
- "khk": "Halh Mongolian",
50
- "khm": "Khmer",
51
- "kir": "Kyrgyz",
52
- "kor": "Korean",
53
- "lao": "Lao",
54
- "lit": "Lithuanian",
55
- "ltz": "Luxembourgish",
56
- "lug": "Ganda",
57
- "luo": "Luo",
58
- "lvs": "Standard Latvian",
59
- "mai": "Maithili",
60
- "mal": "Malayalam",
61
- "mar": "Marathi",
62
- "mkd": "Macedonian",
63
- "mlt": "Maltese",
64
- "mni": "Meitei",
65
- "mya": "Burmese",
66
- "nld": "Dutch",
67
- "nno": "Norwegian Nynorsk",
68
- "nob": "Norwegian Bokm\u00e5l",
69
- "npi": "Nepali",
70
- "nya": "Nyanja",
71
- "oci": "Occitan",
72
- "ory": "Odia",
73
- "pan": "Punjabi",
74
- "pbt": "Southern Pashto",
75
- "pes": "Western Persian",
76
- "pol": "Polish",
77
- "por": "Portuguese",
78
- "ron": "Romanian",
79
- "rus": "Russian",
80
- "slk": "Slovak",
81
- "slv": "Slovenian",
82
- "sna": "Shona",
83
- "snd": "Sindhi",
84
- "som": "Somali",
85
- "spa": "Spanish",
86
- "srp": "Serbian",
87
- "swe": "Swedish",
88
- "swh": "Swahili",
89
- "tam": "Tamil",
90
- "tel": "Telugu",
91
- "tgk": "Tajik",
92
- "tgl": "Tagalog",
93
- "tha": "Thai",
94
- "tur": "Turkish",
95
- "ukr": "Ukrainian",
96
- "urd": "Urdu",
97
- "uzn": "Northern Uzbek",
98
- "vie": "Vietnamese",
99
- "xho": "Xhosa",
100
- "yor": "Yoruba",
101
- "yue": "Cantonese",
102
- "zlm": "Colloquial Malay",
103
- "zsm": "Standard Malay",
104
- "zul": "Zulu",
105
  }
106
- original_language_code_to_name = {
107
- "afr": "Afrikaans",
108
- "amh": "አማርኛ",
109
- "arb": "العربية",
110
- "ary": "الدارجة المغربية",
111
- "arz": "العامية المصرية",
112
- "asm": "অসমীয়া",
113
- "ast": "Asturianu",
114
- "azj": "Azərbaycanca",
115
- "bel": "Беларуская",
116
- "ben": "বাংলা",
117
- "bos": "Bosanski",
118
- "bul": "Български",
119
- "cat": "Català",
120
- "ceb": "Cebuano",
121
- "ces": "Čeština",
122
- "ckb": "کوردی ناوەندی",
123
- "cmn": "普通话",
124
- "cym": "Cymraeg",
125
- "dan": "Dansk",
126
- "deu": "Deutsch",
127
- "ell": "Ελληνικά",
128
- "eng": "English",
129
- "est": "Eesti",
130
- "eus": "Euskara",
131
- "fin": "Suomi",
132
- "fra": "Français",
133
- "gaz": "Afaan Oromoo",
134
- "gle": "Gaeilge",
135
- "glg": "Galego",
136
- "guj": "ગુજરાતી",
137
- "heb": "עברית",
138
- "hin": "हिंदी",
139
- "hrv": "Hrvatski",
140
- "hun": "Magyar",
141
- "hye": "Հայերեն",
142
- "ibo": "Igbo",
143
- "ind": "Bahasa Indonesia",
144
- "isl": "Íslenska",
145
- "ita": "Italiano",
146
- "jav": "Basa Jawa",
147
- "jpn": "日本語",
148
- "kam": "Kikamba",
149
- "kan": "ಕನ್ನಡ",
150
- "kat": "ქართული",
151
- "kaz": "Қазақ тілі",
152
- "kea": "Kriolu di Kabuverdianu",
153
- "khk": "Халх",
154
- "khm": "ខ្មែរ",
155
- "kir": "Кыргызча",
156
- "kor": "한국어",
157
- "lao": "ລາວ",
158
- "lit": "Lietuvių",
159
- "ltz": "Lëtzebuergesch",
160
- "lug": "Luganda",
161
- "luo": "Dholuo",
162
- "lvs": "Latviešu",
163
- "mai": "मैथिली",
164
- "mal": "മലയാളം",
165
- "mar": "मराठी",
166
- "mkd": "Македонски",
167
- "mlt": "Malti",
168
- "mni": "মৈতৈলোন",
169
- "mya": "မြန်မာ",
170
- "nld": "Nederlands",
171
- "nno": "Nynorsk",
172
- "nob": "Bokmål",
173
- "npi": "नेपाली",
174
- "nya": "Chichewa",
175
- "oci": "Occitan",
176
- "ory": "ଓଡ଼ିଆ",
177
- "pan": "ਪੰਜਾਬੀ",
178
- "pbt": "پښتو",
179
- "pes": "فارسی",
180
- "pol": "Polski",
181
- "por": "Português",
182
- "ron": "Română",
183
- "rus": "Русский",
184
- "slk": "Slovenčina",
185
- "slv": "Slovenščina",
186
- "sna": "ChiShona",
187
- "snd": "سنڌي",
188
- "som": "Soomaali",
189
- "spa": "Español",
190
- "srp": "Српски",
191
- "swe": "Svenska",
192
- "swh": "Kiswahili",
193
- "tam": "தமிழ்",
194
- "tel": "తెలుగు",
195
- "tgk": "Тоҷикӣ",
196
- "tgl": "Tagalog",
197
- "tha": "ไทย",
198
- "tur": "Türkçe",
199
- "ukr": "Українська",
200
- "urd": "اردو",
201
- "uzn": "O‘zbekcha",
202
- "vie": "Tiếng Việt",
203
- "xho": "IsiXhosa",
204
- "yor": "Yorùbá",
205
- "yue": "粤语",
206
- "zlm": "Bahasa Melayu",
207
- "zsm": "Bahasa Melayu",
208
- "zul": "IsiZulu",
209
- }
210
- LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
211
- ORIGINAL_LANGUAGE_NAME_TO_CODE = {v: k for k, v in original_language_code_to_name.items()}
212
-
213
- # Source langs: S2ST / S2TT / ASR don't need source lang
214
- # T2TT / T2ST use this
215
- text_source_language_codes = [
216
- "afr",
217
- "amh",
218
- "arb",
219
- "ary",
220
- "arz",
221
- "asm",
222
- "azj",
223
- "bel",
224
- "ben",
225
- "bos",
226
- "bul",
227
- "cat",
228
- "ceb",
229
- "ces",
230
- "ckb",
231
- "cmn",
232
- "cym",
233
- "dan",
234
- "deu",
235
- "ell",
236
- "eng",
237
- "est",
238
- "eus",
239
- "fin",
240
- "fra",
241
- "gaz",
242
- "gle",
243
- "glg",
244
- "guj",
245
- "heb",
246
- "hin",
247
- "hrv",
248
- "hun",
249
- "hye",
250
- "ibo",
251
- "ind",
252
- "isl",
253
- "ita",
254
- "jav",
255
- "jpn",
256
- "kan",
257
- "kat",
258
- "kaz",
259
- "khk",
260
- "khm",
261
- "kir",
262
- "kor",
263
- "lao",
264
- "lit",
265
- "lug",
266
- "luo",
267
- "lvs",
268
- "mai",
269
- "mal",
270
- "mar",
271
- "mkd",
272
- "mlt",
273
- "mni",
274
- "mya",
275
- "nld",
276
- "nno",
277
- "nob",
278
- "npi",
279
- "nya",
280
- "ory",
281
- "pan",
282
- "pbt",
283
- "pes",
284
- "pol",
285
- "por",
286
- "ron",
287
- "rus",
288
- "slk",
289
- "slv",
290
- "sna",
291
- "snd",
292
- "som",
293
- "spa",
294
- "srp",
295
- "swe",
296
- "swh",
297
- "tam",
298
- "tel",
299
- "tgk",
300
- "tgl",
301
- "tha",
302
- "tur",
303
- "ukr",
304
- "urd",
305
- "uzn",
306
- "vie",
307
- "yor",
308
- "yue",
309
- "zsm",
310
- "zul",
311
- ]
312
- TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])
313
 
314
- # Target langs:
315
- # S2ST / T2ST
316
- s2st_target_language_codes = [
317
- "eng",
318
- "arb",
319
- "ben",
320
- "cat",
321
- "ces",
322
- "cmn",
323
- "cym",
324
- "dan",
325
- "deu",
326
- "est",
327
- "fin",
328
- "fra",
329
- "hin",
330
- "ind",
331
- "ita",
332
- "jpn",
333
- "kor",
334
- "mlt",
335
- "nld",
336
- "pes",
337
- "pol",
338
- "por",
339
- "ron",
340
- "rus",
341
- "slk",
342
- "spa",
343
- "swe",
344
- "swh",
345
- "tel",
346
- "tgl",
347
- "tha",
348
- "tur",
349
- "ukr",
350
- "urd",
351
- "uzn",
352
- "vie",
353
- ]
354
- S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
355
- S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES = sorted([original_language_code_to_name[code] for code in s2st_target_language_codes])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
- # S2TT / ASR
358
- S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
359
- # T2TT
360
- T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Languages dict
2
+ LANGUAGE_NAME_TO_CODE = {
3
+ "العربية": "ar_AR",
4
+ "Čeština": "cs_CZ",
5
+ "Deutsch": "de_DE",
6
+ "English": "en_XX",
7
+ "Español": "es_XX",
8
+ "Eesti": "et_EE",
9
+ "Suomi": "fi_FI",
10
+ "Français": "fr_XX",
11
+ "ગુજરાતી": "gu_IN",
12
+ "हिन्दी": "hi_IN",
13
+ "Italiano": "it_IT",
14
+ "日本語": "ja_XX",
15
+ "Қазақ": "kk_KZ",
16
+ "한국어": "ko_KR",
17
+ "Lietuvių": "lt_LT",
18
+ "Latviešu": "lv_LV",
19
+ "ဗမာ": "my_MM",
20
+ "नेपाली": "ne_NP",
21
+ "Nederlands": "nl_XX",
22
+ "Română": "ro_RO",
23
+ "Русский": "ru_RU",
24
+ "සිංහල": "si_LK",
25
+ "Türkçe": "tr_TR",
26
+ "Tiếng Việt": "vi_VN",
27
+ "中文": "zh_CN",
28
+ "Afrikaans": "af_ZA",
29
+ "Azərbaycan": "az_AZ",
30
+ "বাংলা": "bn_IN",
31
+ "فارسی": "fa_IR",
32
+ "עברית": "he_IL",
33
+ "Hrvatski": "hr_HR",
34
+ "Indonesia": "id_ID",
35
+ "ქართული": "ka_GE",
36
+ "ខ្មែរ": "km_KH",
37
+ "Македонски": "mk_MK",
38
+ "മലയാളം": "ml_IN",
39
+ "Монгол": "mn_MN",
40
+ "मराठी": "mr_IN",
41
+ "Polski": "pl_PL",
42
+ "پښتو": "ps_AF",
43
+ "Português": "pt_XX",
44
+ "Svenska": "sv_SE",
45
+ "Kiswahili": "sw_KE",
46
+ "தமிழ்": "ta_IN",
47
+ "తెలుగు": "te_IN",
48
+ "ไทย": "th_TH",
49
+ "Tagalog": "tl_XX",
50
+ "Українська": "uk_UA",
51
+ "اردو": "ur_PK",
52
+ "isiXhosa": "xh_ZA",
53
+ "Galego": "gl_ES",
54
+ "Slovenščina": "sl_SI"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ # Whisper languages dict
58
+ WHISPER_LANGUAGES = {
59
+ "en": "english",
60
+ "zh": "chinese",
61
+ "de": "german",
62
+ "es": "spanish",
63
+ "ru": "russian",
64
+ "ko": "korean",
65
+ "fr": "french",
66
+ "ja": "japanese",
67
+ "pt": "portuguese",
68
+ "tr": "turkish",
69
+ "pl": "polish",
70
+ "ca": "catalan",
71
+ "nl": "dutch",
72
+ "ar": "arabic",
73
+ "sv": "swedish",
74
+ "it": "italian",
75
+ "id": "indonesian",
76
+ "hi": "hindi",
77
+ "fi": "finnish",
78
+ "vi": "vietnamese",
79
+ "he": "hebrew",
80
+ "uk": "ukrainian",
81
+ "el": "greek",
82
+ "ms": "malay",
83
+ "cs": "czech",
84
+ "ro": "romanian",
85
+ "da": "danish",
86
+ "hu": "hungarian",
87
+ "ta": "tamil",
88
+ "no": "norwegian",
89
+ "th": "thai",
90
+ "ur": "urdu",
91
+ "hr": "croatian",
92
+ "bg": "bulgarian",
93
+ "lt": "lithuanian",
94
+ "la": "latin",
95
+ "mi": "maori",
96
+ "ml": "malayalam",
97
+ "cy": "welsh",
98
+ "sk": "slovak",
99
+ "te": "telugu",
100
+ "fa": "persian",
101
+ "lv": "latvian",
102
+ "bn": "bengali",
103
+ "sr": "serbian",
104
+ "az": "azerbaijani",
105
+ "sl": "slovenian",
106
+ "kn": "kannada",
107
+ "et": "estonian",
108
+ "mk": "macedonian",
109
+ "br": "breton",
110
+ "eu": "basque",
111
+ "is": "icelandic",
112
+ "hy": "armenian",
113
+ "ne": "nepali",
114
+ "mn": "mongolian",
115
+ "bs": "bosnian",
116
+ "kk": "kazakh",
117
+ "sq": "albanian",
118
+ "sw": "swahili",
119
+ "gl": "galician",
120
+ "mr": "marathi",
121
+ "pa": "punjabi",
122
+ "si": "sinhala",
123
+ "km": "khmer",
124
+ "sn": "shona",
125
+ "yo": "yoruba",
126
+ "so": "somali",
127
+ "af": "afrikaans",
128
+ "oc": "occitan",
129
+ "ka": "georgian",
130
+ "be": "belarusian",
131
+ "tg": "tajik",
132
+ "sd": "sindhi",
133
+ "gu": "gujarati",
134
+ "am": "amharic",
135
+ "yi": "yiddish",
136
+ "lo": "lao",
137
+ "uz": "uzbek",
138
+ "fo": "faroese",
139
+ "ht": "haitian creole",
140
+ "ps": "pashto",
141
+ "tk": "turkmen",
142
+ "nn": "nynorsk",
143
+ "mt": "maltese",
144
+ "sa": "sanskrit",
145
+ "lb": "luxembourgish",
146
+ "my": "myanmar",
147
+ "bo": "tibetan",
148
+ "tl": "tagalog",
149
+ "mg": "malagasy",
150
+ "as": "assamese",
151
+ "tt": "tatar",
152
+ "haw": "hawaiian",
153
+ "ln": "lingala",
154
+ "ha": "hausa",
155
+ "ba": "bashkir",
156
+ "jw": "javanese",
157
+ "su": "sundanese",
158
+ }
159
 
160
+ def union_language_dict():
161
+ # Create a dictionary to store the language codes
162
+ language_dict = {}
163
+ # Iterate over the LANGUAGE_NAME_TO_CODE dictionary
164
+ for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
165
+ # Extract the language code (the first two characters before the underscore)
166
+ lang_code = language_code.split('_')[0].lower()
167
+
168
+ # Check if the language code is present in WHISPER_LANGUAGES
169
+ if lang_code in WHISPER_LANGUAGES:
170
+ # Construct the entry for the resulting dictionary
171
+ language_dict[language_name] = {
172
+ "transcriber": lang_code,
173
+ "translator": language_code
174
+ }
175
+ return language_dict
requirements.txt CHANGED
@@ -10,4 +10,7 @@ twitch-dl
10
  pytube
11
  pyperclip
12
  transformers
13
- git+https://github.com/openai/whisper.git
 
 
 
 
10
  pytube
11
  pyperclip
12
  transformers
13
+ git+https://github.com/openai/whisper.git
14
+ sentencepiece
15
+ protobuf
16
+ git+https://github.com/suno-ai/bark.git
translatube.py CHANGED
@@ -4,16 +4,21 @@ import urllib.parse as urlparse
4
  from pytube import YouTube
5
  import re
6
  import subprocess
7
- import time
8
- from lang_list import ORIGINAL_LANGUAGE_NAME_TO_CODE, S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES
9
-
10
  import torch
11
  import whisper
 
 
 
12
 
13
  # get device
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
  # device = torch.device("cpu")
16
- model = whisper.load_model("large-v2", device=device)
 
 
17
 
18
  YOUTUBE = "youtube"
19
  TWITCH = "twitch"
@@ -25,22 +30,19 @@ def copy_url_from_clipboard():
25
  def clear_video_url():
26
  visible = False
27
  image = gr.Image(visible=visible, scale=1)
28
- source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
29
- target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
30
- get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible)
31
- transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)
32
  original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
33
  original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
34
  original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
35
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
36
- transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)
37
  return (
38
  "",
39
  image,
40
  source_languaje,
41
  target_languaje,
42
- get_audio_button,
43
- transcribe_audio_button,
44
  original_audio,
45
  original_audio_transcribed,
46
  translated_audio,
@@ -66,14 +68,13 @@ def get_youtube_video_id(url):
66
  return None
67
 
68
  def is_valid_url(url):
69
- source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
70
- target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
71
- get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=True)
72
  original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=True, interactive=False)
73
  original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=True)
74
  original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=True)
75
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=True)
76
- transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=True)
77
  if "youtube" in url.lower() or "youtu.be" in url.lower():
78
  thumbnail = get_youtube_video_id(url)
79
  if thumbnail:
@@ -81,12 +82,11 @@ def is_valid_url(url):
81
  gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
82
  source_languaje,
83
  target_languaje,
84
- get_audio_button,
85
  gr.Textbox(value=YOUTUBE, label="Stream page", elem_id="stream_page", visible=False),
86
  original_audio,
87
  original_audio_transcribed,
88
  translated_audio,
89
- transcribe_audio_button,
90
  original_audio_translated,
91
  )
92
  else:
@@ -94,12 +94,11 @@ def is_valid_url(url):
94
  gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
95
  source_languaje,
96
  target_languaje,
97
- get_audio_button,
98
  gr.Textbox(value=YOUTUBE, label="Stream page", elem_id="stream_page", visible=False),
99
  original_audio,
100
  original_audio_transcribed,
101
  translated_audio,
102
- transcribe_audio_button,
103
  original_audio_translated,
104
  )
105
  elif "twitch" in url.lower() or "twitch.tv" in url.lower():
@@ -107,36 +106,33 @@ def is_valid_url(url):
107
  gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
108
  source_languaje,
109
  target_languaje,
110
- get_audio_button,
111
  gr.Textbox(value=TWITCH, label="Stream page", elem_id="stream_page", visible=False),
112
  original_audio,
113
  original_audio_transcribed,
114
  translated_audio,
115
- transcribe_audio_button,
116
  original_audio_translated,
117
  )
118
  else:
119
  visible = False
120
  image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
121
- source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
122
- target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
123
- get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible)
124
  stream_page = gr.Textbox(value=ERROR, label="Stream page", elem_id="stream_page", visible=visible)
125
  original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
126
  original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
127
  original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
128
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
129
- transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)
130
  return (
131
  image,
132
  source_languaje,
133
  target_languaje,
134
- get_audio_button,
135
  stream_page,
136
  original_audio,
137
  original_audio_transcribed,
138
  translated_audio,
139
- transcribe_audio_button,
140
  original_audio_translated,
141
  )
142
 
@@ -175,16 +171,20 @@ def get_audio_from_video(url, stream_page):
175
  gr.Textbox(value=filename, label="Stream page", elem_id="stream_page", visible=False)
176
  )
177
 
178
- def trascribe_audio(audio_path):
 
 
 
 
179
  audio = whisper.load_audio(audio_path)
180
  audio = whisper.pad_or_trim(audio)
181
 
182
- mel = whisper.log_mel_spectrogram(audio).to(model.device)
183
-
184
- _, probs = model.detect_language(mel)
185
 
186
- options = whisper.DecodingOptions(fp16 = False)
187
- result = whisper.decode(model, mel, options)
 
188
 
189
  # Save the result to a file
190
  filename = "result.txt"
@@ -192,7 +192,16 @@ def trascribe_audio(audio_path):
192
  f.write(result.text)
193
 
194
  # Remove audio file
195
- # subprocess.run(["rm", audio_path])
 
 
 
 
 
 
 
 
 
196
 
197
  return (
198
  result.text,
@@ -200,18 +209,42 @@ def trascribe_audio(audio_path):
200
  )
201
 
202
  def translate(original_audio_transcribed_path, source_languaje, target_languaje):
203
- # Translate
 
 
 
 
 
 
 
 
204
  with open(original_audio_transcribed_path, "r") as f:
205
- text = f.read()
206
- translated = text
 
 
 
 
 
 
 
207
 
208
  # Save the result to a file
209
  filename = "translated_text.txt"
210
  with open(filename, "w") as f:
211
- f.write(text)
212
 
213
- # Remove audio file
214
- # subprocess.run(["rm", original_audio_transcribed_path])
 
 
 
 
 
 
 
 
 
215
 
216
  return (
217
  translated,
@@ -219,14 +252,26 @@ def translate(original_audio_transcribed_path, source_languaje, target_languaje)
219
  )
220
 
221
  def tex2speech(original_audio_translated_path):
222
- pass
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  with gr.Blocks() as demo:
 
225
  with gr.Row(variant="panel"):
226
  url_textbox = gr.Textbox(placeholder="Add video URL here", label="Video URL", elem_id="video_url", scale=1, interactive=True)
227
  copy_button = gr.Button(size="sm", icon="icons/copy.svg", value="", min_width="10px", scale=0)
228
  delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="", min_width="10px", scale=0)
229
- copy_button.click(fn=copy_url_from_clipboard, outputs=url_textbox)
230
 
231
  stream_page = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)
232
  visible = False
@@ -234,11 +279,10 @@ with gr.Blocks() as demo:
234
  image = gr.Image(visible=visible, scale=1)
235
  with gr.Column():
236
  with gr.Row():
237
- source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
238
- target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
239
  with gr.Row():
240
- get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible)
241
- transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)
242
 
243
  original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
244
  original_audio_path = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)
@@ -247,40 +291,44 @@ with gr.Blocks() as demo:
247
  original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
248
  original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=False)
249
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
250
- url_textbox.change(
251
- fn=is_valid_url,
252
- inputs=url_textbox,
 
 
 
253
  outputs=[
 
254
  image,
255
  source_languaje,
256
  target_languaje,
257
- get_audio_button,
258
- stream_page,
259
  original_audio,
260
  original_audio_transcribed,
261
  translated_audio,
262
- transcribe_audio_button,
263
  original_audio_translated,
264
  ]
265
  )
266
- delete_button.click(
267
- fn=clear_video_url,
 
268
  outputs=[
269
- url_textbox,
270
  image,
271
  source_languaje,
272
  target_languaje,
273
- get_audio_button,
274
- transcribe_audio_button,
275
  original_audio,
276
  original_audio_transcribed,
277
  translated_audio,
278
  original_audio_translated,
279
  ]
280
  )
281
- get_audio_button.click(fn=get_audio_from_video, inputs=[url_textbox, stream_page], outputs=[original_audio, original_audio_path])
282
- original_audio.change(fn=trascribe_audio, inputs=original_audio_path, outputs=[original_audio_transcribed, original_audio_transcribed_path])
283
  original_audio_transcribed.change(fn=translate, inputs=[original_audio_transcribed_path, source_languaje, target_languaje], outputs=[original_audio_translated, original_audio_translated_path])
284
- original_audio_translated.change(fn=tex2speech, inputs=original_audio_translated_path, outputs=translated_audio)
 
 
285
 
286
  demo.launch()
 
4
  from pytube import YouTube
5
  import re
6
  import subprocess
7
+ import torch
8
+ from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, pipeline
9
+ from lang_list import union_language_dict
10
  import torch
11
  import whisper
12
+ from bark import SAMPLE_RATE, generate_audio, preload_models
13
+ from scipy.io.wavfile import write as write_wav
14
+ import gc
15
 
16
  # get device
17
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
  # device = torch.device("cpu")
19
+
20
+ # Create a dictionary to store the language codes
21
+ language_dict = union_language_dict()
22
 
23
  YOUTUBE = "youtube"
24
  TWITCH = "twitch"
 
30
  def clear_video_url():
31
  visible = False
32
  image = gr.Image(visible=visible, scale=1)
33
+ source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
34
+ target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
35
+ translate_button = gr.Button(size="lg", value="translate", min_width="10px", scale=0, visible=visible)
 
36
  original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
37
  original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
38
  original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
39
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
 
40
  return (
41
  "",
42
  image,
43
  source_languaje,
44
  target_languaje,
45
+ translate_button,
 
46
  original_audio,
47
  original_audio_transcribed,
48
  translated_audio,
 
68
  return None
69
 
70
  def is_valid_url(url):
71
+ source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
72
+ target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
73
+ translate_button = gr.Button(size="lg", value="translate", min_width="10px", scale=0, visible=True)
74
  original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=True, interactive=False)
75
  original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=True)
76
  original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=True)
77
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=True)
 
78
  if "youtube" in url.lower() or "youtu.be" in url.lower():
79
  thumbnail = get_youtube_video_id(url)
80
  if thumbnail:
 
82
  gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
83
  source_languaje,
84
  target_languaje,
85
+ translate_button,
86
  gr.Textbox(value=YOUTUBE, label="Stream page", elem_id="stream_page", visible=False),
87
  original_audio,
88
  original_audio_transcribed,
89
  translated_audio,
 
90
  original_audio_translated,
91
  )
92
  else:
 
94
  gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
95
  source_languaje,
96
  target_languaje,
97
+ translate_button,
98
  gr.Textbox(value=YOUTUBE, label="Stream page", elem_id="stream_page", visible=False),
99
  original_audio,
100
  original_audio_transcribed,
101
  translated_audio,
 
102
  original_audio_translated,
103
  )
104
  elif "twitch" in url.lower() or "twitch.tv" in url.lower():
 
106
  gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
107
  source_languaje,
108
  target_languaje,
109
+ translate_button,
110
  gr.Textbox(value=TWITCH, label="Stream page", elem_id="stream_page", visible=False),
111
  original_audio,
112
  original_audio_transcribed,
113
  translated_audio,
 
114
  original_audio_translated,
115
  )
116
  else:
117
  visible = False
118
  image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
119
+ source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
120
+ target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
121
+ translate_button = gr.Button(size="lg", value="translate", min_width="10px", scale=0, visible=visible)
122
  stream_page = gr.Textbox(value=ERROR, label="Stream page", elem_id="stream_page", visible=visible)
123
  original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
124
  original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
125
  original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
126
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
 
127
  return (
128
  image,
129
  source_languaje,
130
  target_languaje,
131
+ translate_button,
132
  stream_page,
133
  original_audio,
134
  original_audio_transcribed,
135
  translated_audio,
 
136
  original_audio_translated,
137
  )
138
 
 
171
  gr.Textbox(value=filename, label="Stream page", elem_id="stream_page", visible=False)
172
  )
173
 
174
+ def trascribe_audio(audio_path, source_lang):
175
+ # Load the model
176
+ trascribe_model = whisper.load_model("large-v2", device=device)
177
+
178
+ # load audio and pad/trim it to fit 30 seconds
179
  audio = whisper.load_audio(audio_path)
180
  audio = whisper.pad_or_trim(audio)
181
 
182
+ # make log-Mel spectrogram and move to the same device as the model
183
+ mel = whisper.log_mel_spectrogram(audio).to(trascribe_model.device)
 
184
 
185
+ # Decode the result
186
+ options = whisper.DecodingOptions(fp16 = False, language = language_dict[source_lang]['transcriber'])
187
+ result = whisper.decode(trascribe_model, mel, options)
188
 
189
  # Save the result to a file
190
  filename = "result.txt"
 
192
  f.write(result.text)
193
 
194
  # Remove audio file
195
+ subprocess.run(["rm", audio_path])
196
+
197
+ # free gpu memory
198
+ del trascribe_model
199
+ del audio
200
+ del mel
201
+ del options
202
+ if device == "cuda":
203
+ torch.cuda.empty_cache()
204
+ gc.collect()
205
 
206
  return (
207
  result.text,
 
209
  )
210
 
211
  def translate(original_audio_transcribed_path, source_languaje, target_languaje):
212
+ # model
213
+ translate_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt").to(device)
214
+ translate_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
215
+
216
+ # Get source and target languaje codes
217
+ source_languaje_code = language_dict[source_languaje]["translator"]
218
+ target_languaje_code = language_dict[target_languaje]["translator"]
219
+
220
+ # Get the transcribed text
221
  with open(original_audio_transcribed_path, "r") as f:
222
+ transcribed_text = f.read()
223
+
224
+ # Translate the text
225
+ encoded = translate_tokenizer(transcribed_text, return_tensors="pt").to(device)
226
+ generated_tokens = translate_model.generate(
227
+ **encoded,
228
+ forced_bos_token_id=translate_tokenizer.lang_code_to_id[target_languaje_code]
229
+ )
230
+ translated = translate_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
231
 
232
  # Save the result to a file
233
  filename = "translated_text.txt"
234
  with open(filename, "w") as f:
235
+ f.write(translated)
236
 
237
+ # Remove transcribed file
238
+ subprocess.run(["rm", original_audio_transcribed_path])
239
+
240
+ # free gpu memory
241
+ del translate_model
242
+ del translate_tokenizer
243
+ del encoded
244
+ del generated_tokens
245
+ if device == "cuda":
246
+ torch.cuda.empty_cache()
247
+ gc.collect()
248
 
249
  return (
250
  translated,
 
252
  )
253
 
254
  def tex2speech(original_audio_translated_path):
255
+ with open(original_audio_translated_path, "r") as f:
256
+ translated_text = f.read()
257
+
258
+ preload_models()
259
+ speech_array = generate_audio(translated_text, history_prompt="v2/es_speaker_1")
260
+
261
+ translated_audio_path = "translated_audio.wav"
262
+ write_wav(translated_audio_path, SAMPLE_RATE, speech_array)
263
+
264
+ return translated_audio_path
265
+
266
def delete_translated_audio(translated_audio_path):
    """Delete the generated translated-audio file, if there is one.

    Replaces the previous ``rm`` subprocess call with ``os.remove`` so that the
    cleanup works on platforms without ``rm`` and does not spawn a process.

    Args:
        translated_audio_path: Path of the audio file to delete.  Empty values
            and values that are not paths (for example what a cleared or
            non-filepath Audio component passes) are ignored.

    Returns:
        None.
    """
    import os
    import sys

    # A cleared component passes None; an Audio component may pass non-path data.
    if not translated_audio_path or not isinstance(translated_audio_path, (str, os.PathLike)):
        return

    try:
        os.remove(translated_audio_path)
    except FileNotFoundError:
        # Already gone: the goal of this cleanup is met.
        pass
    except OSError as error:
        # Like `rm`, report the failure but do not abort the UI event.
        print(f"Could not remove {translated_audio_path}: {error}", file=sys.stderr)
268
 
269
  with gr.Blocks() as demo:
270
+ # Layout
271
  with gr.Row(variant="panel"):
272
  url_textbox = gr.Textbox(placeholder="Add video URL here", label="Video URL", elem_id="video_url", scale=1, interactive=True)
273
  copy_button = gr.Button(size="sm", icon="icons/copy.svg", value="", min_width="10px", scale=0)
274
  delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="", min_width="10px", scale=0)
 
275
 
276
  stream_page = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)
277
  visible = False
 
279
  image = gr.Image(visible=visible, scale=1)
280
  with gr.Column():
281
  with gr.Row():
282
+ source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
283
+ target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
284
  with gr.Row():
285
+ translate_button = gr.Button(size="lg", value="translate", min_width="10px", scale=0, visible=visible)
 
286
 
287
  original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
288
  original_audio_path = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)
 
291
  original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
292
  original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=False)
293
  translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
294
+ translated_audio_translated_path = gr.Textbox(label="translated audio translated", elem_id="translated_audio_translated", visible=False)
295
+
296
+ # Events
297
+ copy_button.click(fn=copy_url_from_clipboard, outputs=url_textbox)
298
+ delete_button.click(
299
+ fn=clear_video_url,
300
  outputs=[
301
+ url_textbox,
302
  image,
303
  source_languaje,
304
  target_languaje,
305
+ translate_button,
 
306
  original_audio,
307
  original_audio_transcribed,
308
  translated_audio,
 
309
  original_audio_translated,
310
  ]
311
  )
312
+ url_textbox.change(
313
+ fn=is_valid_url,
314
+ inputs=url_textbox,
315
  outputs=[
 
316
  image,
317
  source_languaje,
318
  target_languaje,
319
+ translate_button,
320
+ stream_page,
321
  original_audio,
322
  original_audio_transcribed,
323
  translated_audio,
324
  original_audio_translated,
325
  ]
326
  )
327
+ translate_button.click(fn=get_audio_from_video, inputs=[url_textbox, stream_page], outputs=[original_audio, original_audio_path])
328
+ original_audio.change(fn=trascribe_audio, inputs=[original_audio_path, source_languaje], outputs=[original_audio_transcribed, original_audio_transcribed_path])
329
  original_audio_transcribed.change(fn=translate, inputs=[original_audio_transcribed_path, source_languaje, target_languaje], outputs=[original_audio_translated, original_audio_translated_path])
330
+ # original_audio_translated.change(fn=tex2speech, inputs=original_audio_translated_path, outputs=translated_audio)
331
+ # translated_audio.change(fn=delete_translated_audio, inputs=translated_audio)
332
+
333
 
334
  demo.launch()