File size: 1,662 Bytes
4a14093
 
 
 
21a9521
 
4a14093
 
 
 
 
 
a6b596f
4a14093
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd6bcbd
4a14093
bd6bcbd
4a14093
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import streamlit as st
from transformers import GPT2TokenizerFast, AutoModelForCausalLM
from arabert.preprocess import ArabertPreprocessor

# Load the tokenizer, the summarization model and the text preprocessor.

model_name = "malmarjeh/gpt2"


@st.cache_resource
def _load_resources(checkpoint):
    """Load and return ``(tokenizer, model, preprocessor)`` for *checkpoint*.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Without caching, each rerun would call ``from_pretrained``
    again; ``st.cache_resource`` keeps a single shared copy of the returned
    objects and hands it back on later reruns.

    The tokenizer is loaded from "aubmindlab/aragpt2-base" rather than from
    *checkpoint*; the model weights and the preprocessor use *checkpoint*.
    """
    loaded_tokenizer = GPT2TokenizerFast.from_pretrained("aubmindlab/aragpt2-base")
    loaded_model = AutoModelForCausalLM.from_pretrained(checkpoint)
    loaded_preprocessor = ArabertPreprocessor(model_name=checkpoint)
    return loaded_tokenizer, loaded_model, loaded_preprocessor


tokenizer, model, preprocessor = _load_resources(model_name)

# Streamlit UI: a text area for the article and a button that triggers
# summarization of whatever is currently in the text area.
st.title('Arabic Text Summarizer | By M.Araby')
text = st.text_area("Paste your Arabic text here:")

if st.button('Summarize'):
    # Treat whitespace-only input like empty input instead of sending an
    # empty prompt to the model.
    if text.strip():
        # Preprocess the raw input, then wrap it in the prompt template:
        # a "text:" section followed by an open "summary:" section for the
        # model to continue.
        processed_text = preprocessor.preprocess(text)
        formatted_text = '\n النص: ' + processed_text + ' \n الملخص: \n '
        # Padding requires a pad token to be registered on the tokenizer.
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
        tokens = tokenizer.batch_encode_plus([formatted_text], return_tensors='pt', padding='max_length',
                                             max_length=150)
        # The call above pads short prompts up to 150 tokens but does not ask
        # for truncation, so a long prompt yields more than 150 ids. Record
        # the real prompt length so it can be stripped from the output below.
        prompt_length = tokens['input_ids'].shape[1]

        # Generate summary with beam search.
        # NOTE(review): only input_ids is passed; the tokenizer's
        # attention_mask is not. Left unchanged because passing it would
        # alter the generated text -- confirm against how the model was
        # fine-tuned before changing.
        output = model.generate(
            input_ids=tokens['input_ids'],
            repetition_penalty=2.0,
            num_beams=5,
            max_length=600,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
        )

        # The generated sequence starts with the prompt ids; drop exactly
        # that many so only the newly generated summary is decoded. A fixed
        # offset of 150 would leak the tail of a longer prompt into the
        # result.
        result = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
        st.subheader("Original Text Input")
        st.write(text)
        st.subheader("Summarized Text Idea")
        st.write(result)
    else:
        st.warning("Please enter Arabic text to summarize.")