storresbusquets committed on
Commit 31eb124
1 Parent(s): 24a4fff

Update app.py

Files changed (1)
  1. app.py +28 -44
app.py CHANGED
@@ -86,49 +86,34 @@ class GradioInference:
         progress(0.40, desc="Summarizing")

         # Perform summarization on the transcription
-        # transcription_summary = self.summarizer(
-        #     results["text"], max_length=150, min_length=30, do_sample=False
-        # )
+        transcription_summary = self.summarizer(
+            results["text"], max_length=150, min_length=30, do_sample=False
+        )

-        #### Test
-        # WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
-
-        # input_ids_sum = self.tokenizer(
-        #     [WHITESPACE_HANDLER(results["text"])],
-        #     return_tensors="pt",
-        #     padding="max_length",
-        #     truncation=True,
-        #     max_length=512
-        # )["input_ids"]
-
-        # output_ids_sum = self.model.generate(
-        #     input_ids=input_ids_sum,
-        #     max_length=130,
-        #     no_repeat_ngram_size=2,
-        #     num_beams=4
-        # )[0]
+        #### Multilingual summary
+        WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))

-        # summary = self.tokenizer.decode(
-        #     output_ids_sum,
-        #     skip_special_tokens=True,
-        #     clean_up_tokenization_spaces=False
-        # )
-        #### End test
+        input_ids_sum = self.tokenizer(
+            [WHITESPACE_HANDLER(results["text"])],
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=512
+        )["input_ids"]

-        ### LLM test ###
-
-        template = """
-        [INST] <<SYS>>
-        You are a helpful, respectful and honest assistant that performs summaries of text. Write a concise summary of the following text.
-        <</SYS>>
-        {text}[/INST]
-        """
+        output_ids_sum = self.model.generate(
+            input_ids=input_ids_sum,
+            max_length=130,
+            no_repeat_ngram_size=2,
+            num_beams=4
+        )[0]

-        prompt = PromptTemplate(template=template, input_variables=["text"])
-        llm_chain = LLMChain(prompt=prompt, llm=self.llm)
-        summary2 = llm_chain.run(results["text"])
-
-        ### End LLM test ###
+        summary = self.tokenizer.decode(
+            output_ids_sum,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
+        #### End multilingual summary

         progress(0.60, desc="Extracting Keywords")

@@ -171,8 +156,7 @@ class GradioInference:
         if lang == "english" or lang == "none":
             return (
                 results["text"],
-                summary2,
-                # transcription_summary[0]["summary_text"],
+                transcription_summary[0]["summary_text"],
                 formatted_keywords,
                 formatted_sentiment,
                 wordcloud_image,
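
Note on the English branch above: self.summarizer is a Transformers summarization pipeline, which returns a list of dicts, so the first summary is read as transcription_summary[0]["summary_text"]. A minimal standalone sketch of that call, assuming an English checkpoint such as facebook/bart-large-cnn (the diff does not show which model backs self.summarizer):

from transformers import pipeline

# Assumed checkpoint; the commit only shows the pipeline call, not the model name.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

text = "..."  # the transcription (results["text"] in the app)
out = summarizer(text, max_length=150, min_length=30, do_sample=False)
print(out[0]["summary_text"])  # pipeline output is a list of dicts
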
@@ -180,7 +164,7 @@ class GradioInference:
         else:
             return (
                 results["text"],
-                summary2,
+                summary,
                 formatted_keywords,
                 formatted_sentiment,
                 wordcloud_image,
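
Note on the non-English branch above: summary comes from the multilingual path added in the first hunk, which whitespace-normalizes the transcription, tokenizes it, generates with beam search, and decodes. A standalone sketch of that path, assuming self.tokenizer and self.model wrap a multilingual seq2seq summarizer such as csebuetnlp/mT5_multilingual_XLSum (an assumption; the checkpoint is not shown in this diff):

import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def summarize(text: str) -> str:
    # Collapse newlines and repeated whitespace, as the WHITESPACE_HANDLER lambda does.
    clean = re.sub(r"\s+", " ", re.sub(r"\n+", " ", text.strip()))
    input_ids = tokenizer(
        [clean],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]
    # Same generation settings as the diff: beam search, short output, no repeated bigrams.
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=130,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]
    return tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
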
@@ -229,7 +213,7 @@ class GradioInference:
             results["text"], max_length=150, min_length=30, do_sample=False
         )

-        #### Test
+        #### Multilingual summary
         WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))

         input_ids_sum = self.tokenizer(
@@ -252,7 +236,7 @@ class GradioInference:
             skip_special_tokens=True,
             clean_up_tokenization_spaces=False
         )
-        #### End test
+        #### End multilingual summary

         progress(0.50, desc="Extracting Keywords")

 
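For reference, the lines removed in the first hunk had routed summarization through a LangChain LLMChain built around a Llama-2-style chat prompt. A minimal sketch of that dropped approach, assuming self.llm was a LangChain LLM wrapper such as a HuggingFaceHub-hosted chat model (neither the wrapper nor the checkpoint is shown in this diff):

from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFaceHub

template = """
[INST] <<SYS>>
You are a helpful, respectful and honest assistant that performs summaries of text. Write a concise summary of the following text.
<</SYS>>
{text}[/INST]
"""

# Assumed LLM backend; requires HUGGINGFACEHUB_API_TOKEN in the environment.
llm = HuggingFaceHub(repo_id="meta-llama/Llama-2-7b-chat-hf")

prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
summary2 = llm_chain.run("the transcription text")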