# Spaces: Sleeping  (page-header residue from the hosting site; not part of the program)
import streamlit as st | |
from langchain.document_loaders import ArxivLoader | |
from transformers import pipeline | |
def strip(content):
    """Return *content* flattened onto a single line.

    The argument is converted with ``str()`` and every newline character is
    replaced by one space. No other whitespace is altered, so consecutive
    newlines become consecutive spaces.

    Args:
        content: Any object; it is stringified before processing.

    Returns:
        The stringified content with ``"\\n"`` replaced by ``" "``.
    """
    return str(content).replace("\n", " ")
def clip(content):
    """Trim *content* to the text between its introduction and references.

    The first occurrence of ``"Introduction"`` marks the start and the last
    occurrence of ``"Reference"`` marks the end (exclusive). Matching is
    case-sensitive plain substring search.

    * If no ``"Introduction"`` is found, a warning is printed and the text is
      returned unchanged.
    * If no ``"Reference"`` is found *after* the introduction, a warning is
      printed and everything from the introduction onward is returned.

    Args:
        content: The paper text as a string.

    Returns:
        The clipped string.
    """
    loc_intro = content.find("Introduction")
    if loc_intro == -1:
        print("Warning: Paper Doesn't Have an Introduction Title, these may lead to overlap of summarization")
        return content

    loc_refer = content.rfind("Reference")
    # Only accept a references marker located after the introduction; a
    # marker that precedes it would make the slice below empty and throw the
    # whole paper away. A missing marker (-1) fails this test as well.
    if loc_refer > loc_intro:
        return content[loc_intro:loc_refer]

    print("Warning: Paper Doesn't have a References Title, may lead to overlap of references in summary")
    return content[loc_intro:]
def chunk(content, sentences_per_chunk=10):
    """Split *content* into chunks of a fixed number of sentences.

    The text is first passed through ``clip`` and then split on ``". "``.
    Pieces are stripped of surrounding whitespace and blank pieces are
    dropped, so empty input produces an empty list instead of a chunk that
    contains only punctuation. Each chunk is the pieces re-joined with
    ``". "`` and is terminated with a period unless it already ends in
    sentence punctuation (which avoids doubled terminators such as ``".."``).

    Args:
        content: The paper text as a string.
        sentences_per_chunk: Maximum number of sentences per chunk.

    Returns:
        A list of strings, one per chunk, in document order.

    Raises:
        ValueError: If *sentences_per_chunk* is smaller than 1.
    """
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be at least 1")

    print("-----Clipping content between Intro and References--------")
    content = clip(content)

    pieces = [piece.strip() for piece in content.split(". ")]
    sentences = [piece for piece in pieces if piece]

    sent = []
    for start in range(0, len(sentences), sentences_per_chunk):
        text = ". ".join(sentences[start:start + sentences_per_chunk])
        # The split removed the period of every piece except possibly the
        # very last one, so restore it only where it is actually missing.
        if not text.endswith((".", "!", "?")):
            text += "."
        sent.append(text)
    return sent
def summarize(sent, model_str="Falconsai/text_summarization", tokenizer_str=None):
    """Summarise each text chunk and concatenate the summaries.

    A ``"summarization"`` pipeline is created once per call and applied to
    every chunk with ``max_length=256``, ``min_length=64`` and
    ``do_sample=False``. Each summary is followed by a newline in the result.

    Args:
        sent: Iterable of text chunks (strings) to summarise.
        model_str: Model identifier passed to ``pipeline``.
        tokenizer_str: Tokenizer identifier passed to ``pipeline``; defaults
            to *model_str* when omitted.

    Returns:
        The summaries joined into one string, each terminated by ``"\\n"``.
        An empty string when *sent* is empty.
    """
    # Nothing to summarise: skip creating the pipeline entirely.
    if not sent:
        return ""

    if tokenizer_str is None:
        tokenizer_str = model_str
    summarizer = pipeline("summarization", model=model_str, tokenizer=tokenizer_str)

    summaries = []
    for piece in sent:
        result = summarizer(piece, max_length=256, min_length=64, do_sample=False)
        summaries.append(result[0]['summary_text'] + "\n")
    return "".join(summaries)
def doc_load(search_query="default", n_docs=1):
    """Search arXiv and return the matching papers.

    Args:
        search_query: Query handed to ``ArxivLoader``. The sentinel value
            ``"default"`` skips the search and returns placeholder lists.
        n_docs: Maximum number of documents to load (``load_max_docs``).

    Returns:
        A 3-tuple ``(titles, docs, n_pairs)`` where *titles* is the list of
        each document's ``metadata['Title']``, *docs* is the list returned by
        the loader, and *n_pairs* maps each title to its index in *docs*
        (for duplicate titles the last index wins).
        When the search fails the error is printed and three empty
        containers ``([], [], {})`` are returned, so callers can always
        unpack the result.
    """
    if search_query == "default":
        return [" ", " "], [" ", " "], [" ", " "]

    try:
        print("-------searching Paper----------")
        docs = ArxivLoader(query=search_query, load_max_docs=n_docs).load()
        # Iterate over what was actually returned: the loader may yield fewer
        # than n_docs documents, and indexing by range(n_docs) would then fail.
        titles = [doc.metadata['Title'] for doc in docs]
    except Exception as e:
        print("--------ERROR while Trying to Search Paper-------------")
        print(e)
        return [], [], {}

    n_pairs = {title: index for index, title in enumerate(titles)}
    return titles, docs, n_pairs
def run(choice, docs, n_pairs):
    """Summarise the selected paper and return a formatted report.

    Progress is reported both in the Streamlit page (``st.text``) and on
    standard output (``print``), each message being emitted before the step
    it announces.

    Args:
        choice: Title of the paper to summarise; must be a key of *n_pairs*.
        docs: Sequence of loaded documents exposing ``metadata`` and
            ``page_content``.
        n_pairs: Mapping from paper title to its index in *docs*.

    Returns:
        A string containing the ``Published`` date, ``Title`` and ``Authors``
        metadata entries followed by the generated summary.

    Raises:
        KeyError: If *choice* is not present in *n_pairs*.
    """
    ch = n_pairs[choice]

    st.text("Fetching Metadata")
    print("-----fetching metadata-------------")
    document = docs[ch]
    metadata = document.metadata

    print("----stripping new lines----------")
    content = strip(document.page_content)

    st.text("Chunking Text....")
    print("-----------chunking content--------------")
    sent = chunk(content)

    st.text("🤔 Shortening text...")
    print("----summarizing content---------")
    summarized = summarize(sent)

    published = metadata['Published']
    title = metadata['Title']
    authors = metadata['Authors']
    # Formatting (rather than "+") keeps this working if a metadata value is
    # not a plain string.
    out = f"Date: {published}\n\n Title: {title}\n\n Authors: {authors}\n\n Summary: \n{summarized}"
    return out
st.title("ArXiV Summarizer")

# Streamlit re-executes this script from the top on every form submission.
# Search results therefore have to live in session state; otherwise they are
# gone by the time the second form ("Fetch Paper & Summarize") is submitted.
for state_key, empty_value in (("titles", []), ("docs", []), ("n_pairs", {})):
    if state_key not in st.session_state:
        st.session_state[state_key] = empty_value

with st.form(key="search_form"):
    col1, col2 = st.columns(2)
    with col1:
        search_query = st.text_input("Search Using Paper ID or Name*")
    with col2:
        n_docs = st.selectbox(
            label="Number of Documents to Load",
            options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
        )
    submit = st.form_submit_button(label="Search")

if submit:
    st.write("Fetching Papers 🤔 ")
    try:
        result = doc_load(search_query=search_query, n_docs=n_docs)
    except Exception as e:
        print(e)
        result = None
    # A failed search may come back as None; normalise it to empty results so
    # the unpacking below never raises.
    if not result:
        result = ([], [], {})
    found_titles, found_docs, found_pairs = result
    st.session_state["titles"] = found_titles
    st.session_state["docs"] = found_docs
    st.session_state["n_pairs"] = found_pairs

    if found_titles:
        st.write("Papers Fetched 🤩 ")
    else:
        st.write("Error while Fetching Papers 😥 Please Check ID or Name")

# Results of the most recent search (possibly from an earlier script run).
titles = st.session_state["titles"]
docs = st.session_state["docs"]
n_pairs = st.session_state["n_pairs"]

label = "Papers for " + search_query
with st.form(key="paper_form"):
    paper_name = st.selectbox(label=label, options=titles)
    submit_paper = st.form_submit_button(label="Fetch Paper & Summarize")

if submit_paper:
    if paper_name is None or not titles:
        # No search has produced results yet, so there is nothing to select.
        st.warning("Please search for a paper before summarizing")
    else:
        st.text("Reading Document.... 📄 ")
        output = run(paper_name, docs, n_pairs)
        st.text_area(label="Summary", value=output, height=650)

st.text('''* - Please Use Paper ID (Example : 2301.10172) as it will give accurate results.
Free text search can give errors sometimes''')
st.text("While using Paper ID no need to change Number of Documents to load")