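# Streamlit app: extract keyphrases from a document with KeyBERT, paraphrase the
# keyword-bearing sentences with T5, and re-extract keyphrases from the paraphrases
# to surface new keyword variants.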
import time
import pandas as pd
import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from keyphrase_vectorizers import KeyphraseCountVectorizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from huggingface_hub import HfFolder
import streamlit as st
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HfFolder.save_token(st.secrets["hf-auth-token"])
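
# Cache the models so Streamlit does not reload the weights on every rerun of the script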
@st.cache(allow_output_mutation=True)
def load_model():
    # Load the KeyBERT keyword extractor on top of a SentenceTransformer encoder
    tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
    kw_extractor = KeyBERT(tmp_model)

    # Load T5 for paraphrasing
    t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
    t5_model = t5_model.to(device)
    return kw_extractor, t5_model, t5_tokenizer

kw_extractor, t5_model, t5_tokenizer = load_model()
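
# KeyphraseCountVectorizer supplies part-of-speech-based noun-phrase candidates,
# so no fixed n-gram range is needed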
@st.cache()
def get_keybert_results_with_vectorizer(text, number_of_results=20):
    keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
    return keywords

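# Generate paraphrases by sampling (top-k/top-p) rather than plain beam search,
# so each of the returned sequences can differ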
@st.cache()
def t5_paraphraser(text, number_of_results=5):
    text = "paraphrase: " + text + " </s>"
    max_len = 2048
    # Truncate inputs to max_len; no padding is needed for a single sequence
    encoding = t5_tokenizer.encode_plus(text, max_length=max_len, truncation=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    beam_outputs = t5_model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=2048,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=number_of_results
    )

    final_outputs = []
    for beam_output in beam_outputs:
        sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        final_outputs.append(sent)
    return final_outputs

#### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
def extract_paraphrased_sentences(article):
    start1 = time.time()
    with st.spinner('Extracting Keywords from Original Document...'):
        original_keywords = [(i[0], i[1]) for i in get_keybert_results_with_vectorizer(article)]
        article_sentences = sent_tokenize(article)
        # Keep only the sentences that contain at least one extracted keyword
        target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
    st.success('Keyword Extraction from Original Document finished in {:.2f}s'.format(time.time() - start1))

    start2 = time.time()
    with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
        t5_paraphrasing_keywords = []
        for sent in target_sentences:
            ### Paraphrase each target sentence with T5, then re-extract keywords
            t5_paraphrased = t5_paraphraser(sent)
            t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
            t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
            t5_paraphrasing_keywords.extend(t5_keywords)
    st.success('Keyword Extraction from Paraphrased Target Sentences finished in {:.2f}s'.format(time.time() - start2))

    original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
    t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first')
    st.dataframe(t5_keywords_df)

    # Keywords found in the paraphrases but absent from the original keywords
    # (regex=False so keywords containing special characters are matched literally)
    unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0], regex=False).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first')

    total_end = time.time() - start1
    return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end

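# Streamlit UI: take a document from the user and display the three keyword tables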
doc = st.text_area("Enter a custom document")

if doc:
    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = extract_paraphrased_sentences(doc)

    st.text(f'T5 PARAPHRASING RUNTIME: {total_end:.2f}s\n')

    st.subheader('\nOriginal Keywords Extracted:\n\n')
    st.dataframe(original_keywords_df)

    st.subheader('\nT5 Unique New Keywords Extracted:\n\n')
    st.dataframe(unique_keywords_df)

    st.subheader('\nT5 Keywords Extracted:\n\n')
    st.dataframe(t5_keywords_df)