Spaces:

Tiju1996
/

English-Hindi_Cross_Lingual_Summarization

Runtime error

App Files Files Community

Tiju1996 committed on Apr 18, 2023

Commit

0cbcb1a

1 Parent(s): 465c6db

Create app.py

Browse files

Files changed (1) hide show

app.py +65 -0

app.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import gradio as gr
+import re
+from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, pipeline
+# Translation model and tokenizer are loaded from the same checkpoint; the
+# tokenizer is configured with English ("en_XX") as the source language.
+_MBART_CHECKPOINT = "facebook/mbart-large-50-one-to-many-mmt"
+tokenizer = MBart50TokenizerFast.from_pretrained(_MBART_CHECKPOINT, src_lang="en_XX")
+model = MBartForConditionalGeneration.from_pretrained(_MBART_CHECKPOINT)
+# Summarization pipeline used to condense the cleaned English text.
+pipe2 = pipeline(task='summarization', model="Tiju1996/conversation-summ")
+# Cleaning patterns, compiled once at import time. process_text applies them
+# in the order they are listed here.
+_MD_IMAGE_RE = re.compile(r'!\[[^\]]*\]\([^)]*\)')   # ![alt](src)
+_MD_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')  # [label](target)
+_BRACKETED_RE = re.compile(r'\[[^\]]*\]')             # [1], [23], [note], ...
+_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
+_HTML_TAG_RE = re.compile(r'<.*?>')
+_URL_RE = re.compile(r'http\S+')
+
+
+def process_text(text):
+    """Clean raw article text before it is summarized.
+
+    Steps, in order:
+      1. Drop markdown images (``![alt](src)``) entirely.
+      2. Replace markdown links (``[label](target)``) with their label.
+      3. Remove any remaining square-bracketed span, which covers numeric
+         citations and footnote markers such as ``[1]``.
+      4. Remove every non-ASCII character (this also removes emoji).
+      5. Remove HTML tags.
+      6. Remove bare URLs starting with ``http``.
+      7. Strip leading and trailing whitespace.
+
+    Args:
+        text: Raw input string. ``None`` or an empty string yields ``""``.
+
+    Returns:
+        The cleaned, ASCII-only string.
+    """
+    if not text:
+        return ""
+    # Markdown images and links must be handled before the generic bracket
+    # stripper; otherwise the label is deleted and "(url)" debris is left.
+    text = _MD_IMAGE_RE.sub('', text)
+    text = _MD_LINK_RE.sub(r'\1', text)
+    # One pass covers citations, footnotes and any other bracketed span.
+    text = _BRACKETED_RE.sub('', text)
+    # ASCII-only output; a separate emoji pass is unnecessary after this.
+    text = _NON_ASCII_RE.sub('', text)
+    text = _HTML_TAG_RE.sub('', text)
+    text = _URL_RE.sub('', text)
+    # Strip all surrounding whitespace, including newlines and tabs.
+    return text.strip()
+def summarize(article_en_raw):
+    """Summarize English text and return the summary translated into Hindi.
+
+    The input is cleaned with ``process_text``, condensed with the ``pipe2``
+    summarization pipeline, and the English summary is then translated by
+    the module-level ``model`` / ``tokenizer`` pair.
+
+    Args:
+        article_en_raw: Raw English text as entered by the user.
+
+    Returns:
+        The translated summary, or ``""`` when nothing is left to summarize
+        after cleaning.
+    """
+    article_en = process_text(article_en_raw or "")
+    if not article_en.strip():
+        # Nothing to summarize; skip both model calls.
+        return ""
+    # truncation=True clips over-length articles to the summarizer's maximum
+    # input size instead of failing inside the model.
+    summary_en = pipe2(article_en, truncation=True)[0]['summary_text']
+    model_inputs = tokenizer(summary_en, return_tensors="pt")
+    generated_tokens = model.generate(
+        **model_inputs,
+        # Force generation to begin with the Hindi language code so the
+        # output is produced in Hindi.
+        forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"],
+    )
+    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+    return translation[0]
+# Build the UI with the top-level gr.Textbox component; the legacy
+# gr.inputs / gr.outputs namespaces are deprecated and absent from newer
+# Gradio releases.
+input_text = gr.Textbox(lines=20, label="Enter text document to be summarized")
+output_text = gr.Textbox(label="Summarized Text")
+demo = gr.Interface(
+    fn=summarize,
+    inputs=input_text,
+    outputs=output_text,
+    title="Text Summarization App",
+    description="Enter a text document and get its summarized version.",
+)
+# launch() takes `inline` as its first positional parameter, not an app name,
+# so the stray positional string is dropped; share=True is kept as before.
+demo.launch(share=True)