Tiju1996 committed on
Commit
0cbcb1a
·
1 Parent(s): 465c6db

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, pipeline
4
+
5
# Checkpoint name shared by the translation model and its tokenizer.
_MBART_CHECKPOINT = "facebook/mbart-large-50-one-to-many-mmt"

# Translation model plus a tokenizer configured with English as the source language.
model = MBartForConditionalGeneration.from_pretrained(_MBART_CHECKPOINT)
tokenizer = MBart50TokenizerFast.from_pretrained(_MBART_CHECKPOINT, src_lang="en_XX")

# Summarization pipeline that runs before translation.
pipe2 = pipeline('summarization', model="Tiju1996/conversation-summ")
8
+
9
+
10
+
11
# Patterns are compiled once at import time instead of on every call.
# Markdown image: ![alt](target) -- removed entirely.
_MD_IMAGE_RE = re.compile(r'!\[[^\]]*\]\([^)]*\)')
# Markdown hyperlink: [label](target) -- replaced by its label.
_MD_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
# Numeric reference citations / footnote markers such as [12] (also matches []).
_CITATION_RE = re.compile(r'\[\d*\]')
# Any remaining bracketed span, optionally preceded by a second one.
_BRACKETED_RE = re.compile(r'(\[[^\]]*\])?\[[^\]]*\]')
# Runs of characters outside the 7-bit ASCII range (this includes emojis).
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
# HTML tags, matched non-greedily within a single line.
_HTML_TAG_RE = re.compile(r'<.*?>')
# Bare URLs: "http" followed by any run of non-whitespace characters.
_URL_RE = re.compile(r'http\S+')


def process_text(text):
    """Clean raw document text before it is handed to the summarizer.

    Steps, in order:
      1. Drop Markdown images (``![alt](url)``).
      2. Replace Markdown hyperlinks (``[label](url)``) with their label.
      3. Drop numeric citations/footnotes (``[12]``) and any other
         bracketed span.
      4. Drop every non-ASCII character.
      5. Drop HTML tags.
      6. Drop bare URLs starting with ``http``.
      7. Strip leading and trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    # Links and images must be handled BEFORE the generic bracket removal;
    # otherwise "[label]" is deleted first and the link label is lost.
    text = _MD_IMAGE_RE.sub('', text)
    text = _MD_LINK_RE.sub(r'\1', text)

    text = _CITATION_RE.sub('', text)
    text = _BRACKETED_RE.sub('', text)

    # Every emoji code point is non-ASCII, so this single pass also removes
    # emojis; a separate emoji pass afterwards would never match anything.
    text = _NON_ASCII_RE.sub('', text)

    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)

    # Strip all surrounding whitespace (spaces, tabs, newlines), not only spaces.
    return text.strip()
48
+
49
+
50
def summarize(article_en_raw):
    """Summarize an English document and return the summary translated to Hindi.

    The raw text is cleaned with ``process_text``, summarized with the
    ``pipe2`` summarization pipeline, and the resulting summary is translated
    by ``model`` with the target language forced to ``hi_IN``.

    Args:
        article_en_raw: The raw English document text.

    Returns:
        The translated summary string, or an empty string when nothing is
        left to summarize after cleaning.
    """
    article_en = process_text(article_en_raw)
    if not article_en:
        # Nothing to summarize; skip both models instead of feeding them "".
        return ""

    # truncation=True keeps over-long documents within the summarizer's
    # maximum input length instead of failing on them.
    summary_en = pipe2(article_en, truncation=True)
    summary_text = summary_en[0]['summary_text']

    model_inputs = tokenizer(summary_text, return_tensors="pt")
    generated_tokens = model.generate(
        **model_inputs,
        # Force the decoder to start with the Hindi language token.
        forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"],
    )
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return translation[0]
60
+
61
# Input and output widgets for the demo.
# NOTE(review): the gr.inputs / gr.outputs namespaces appear to be deprecated
# in newer Gradio releases -- confirm against the installed version before upgrading.
input_text = gr.inputs.Textbox(lines=20, label="Enter text document to be summarized")
output_text = gr.outputs.Textbox(label="Summarized Text")

demo = gr.Interface(
    fn=summarize,
    inputs=input_text,
    outputs=output_text,
    title="Text Summarization App",
    description="Enter a text document and get its summarized version.",
)

# The original passed "English-Hindi-Summary" positionally, which binds to
# launch()'s first parameter (`inline`) rather than naming the app; the app
# title is already set on the Interface, so the stray argument is dropped.
demo.launch(share=True)