
import time
import pandas as pd

import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from keyphrase_vectorizers import KeyphraseCountVectorizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')

from huggingface_hub import HfFolder
import streamlit as st

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Authenticate with the Hugging Face Hub via the token stored in Streamlit
# secrets, so the gated valurank models below can be downloaded.
HfFolder.save_token(st.secrets["hf-auth-token"])


# Cache the loaded models across Streamlit reruns; allow_output_mutation=True
# skips mutation checks on the large, mutable model objects.
@st.cache(allow_output_mutation=True)
def load_model():
    # Load KeyBert Model
    tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
    kw_extractor = KeyBERT(tmp_model)
    
    # Load T5 for Paraphrasing
    t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
    t5_model = t5_model.to(device)
    return kw_extractor, t5_model, t5_tokenizer
    
kw_extractor, t5_model, t5_tokenizer = load_model()


@st.cache()
def get_keybert_results_with_vectorizer(text, number_of_results=20):
    keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
    return keywords
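
# Illustrative sketch of the extractor's return value (the text and scores
# here are invented for the example):
#   get_keybert_results_with_vectorizer("Streamlit turns Python scripts into web apps.")
#   -> [('python scripts', 0.62), ('web apps', 0.57), ...]  # (keyphrase, score) pairs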


@st.cache()
def t5_paraphraser(text, number_of_results=5):
    text = "paraphrase: " + text + " </s>"
    max_len = 2048
    # Truncate overly long inputs so the encoding never exceeds max_len tokens.
    encoding = t5_tokenizer.encode_plus(text, max_length=max_len, truncation=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    # Sample with top-k/top-p (nucleus) sampling so each returned sequence is
    # an independent paraphrase candidate.
    beam_outputs = t5_model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=2048,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=number_of_results
    )
    
    final_outputs = []
    for beam_output in beam_outputs:
        sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        final_outputs.append(sent)
    
    return final_outputs
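
# Illustrative usage (outputs vary across runs because decoding is sampled;
# the sentences below are invented for the example):
#   t5_paraphraser("The cat sat on the mat.", number_of_results=2)
#   -> ['A cat was sitting on the mat.', 'The mat had a cat sitting on it.']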
    
    
#### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
def extract_paraphrased_sentences(article):
     
    start1 = time.time()
    with st.spinner('Extracting Keywords from Original Document...'):
        original_keywords = [(i[0], i[1]) for i in get_keybert_results_with_vectorizer(article)]

        # Keep only the sentences that contain at least one extracted keyphrase.
        article_sentences = sent_tokenize(article)
        target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
    st.success('Keyword Extraction from Original Document finished in {:.2f}s'.format(time.time() - start1))

    
    start2 = time.time()
    with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
        t5_paraphrasing_keywords = []
    
        for sent in target_sentences:
            # Paraphrase the sentence with T5, then re-extract keywords from each paraphrase
            t5_paraphrased = t5_paraphraser(sent)
            t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
            t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
    
            t5_paraphrasing_keywords.extend(t5_keywords)
    st.success('Keyword Extraction from Paraphrased Target Sentences finished in {:.2f}s'.format(time.time() - start2))

    original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])

    t5_keywords_df = (pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score'])
                      .sort_values(by='Score', ascending=False)
                      .drop_duplicates(subset=['Keyword'], keep='first'))
    st.dataframe(t5_keywords_df)

    # Keep only paraphrase keywords that do not already appear (even as a
    # substring) in the original keyword set; regex=False matches literally.
    new_keywords = [i for i in t5_paraphrasing_keywords
                    if not original_keywords_df['Keyword'].str.contains(i[0], regex=False).any()]
    unique_keywords_df = (pd.DataFrame(new_keywords, columns=['Keyword', 'Score'])
                          .sort_values(by='Score', ascending=False)
                          .drop_duplicates(subset=['Keyword'], keep='first'))

    total_end = time.time() - start1

    return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
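
# Illustrative call (the document text is invented for the example):
#   t5_df, orig_df, unique_df, runtime = extract_paraphrased_sentences(
#       "Streamlit is an open-source Python framework for data apps. ...")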
    

doc = st.text_area("Enter a custom document")

if doc:
    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = extract_paraphrased_sentences(doc)
    
    st.text(f'T5 PARAPHRASING RUNTIME: {total_end:.2f}s\n')
    
    st.subheader('Original Keywords Extracted:')
    st.dataframe(original_keywords_df)

    st.subheader('T5 Unique New Keywords Extracted:')
    st.dataframe(unique_keywords_df)

    st.subheader('T5 Keywords Extracted:')
    st.dataframe(t5_keywords_df)
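
# To run the app locally, configure the `hf-auth-token` secret and start
# Streamlit (the filename app.py is illustrative):
#   streamlit run app.py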