Spaces:
Sleeping
Sleeping
File size: 4,043 Bytes
d8f23da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import streamlit as st
from langchain.document_loaders import ArxivLoader
from transformers import pipeline
def strip(content):
    """Return ``content`` as a single-line string.

    The argument is converted with ``str()`` first, then every newline
    character is replaced by one space. Consecutive newlines therefore
    become consecutive spaces; nothing is collapsed or trimmed.
    """
    text = str(content)
    return text.replace("\n", " ")
def clip(content):
    """Trim paper text to the span from "Introduction" up to "Reference".

    The slice starts at the first occurrence of ``"Introduction"`` and ends
    just before the last occurrence of ``"Reference"``.

    * No ``"Introduction"``: a warning is printed and ``content`` is returned
      unchanged.
    * No ``"Reference"`` after the introduction: a warning is printed and
      everything from the introduction to the end is returned.

    Args:
        content: Full paper text as a string.

    Returns:
        The clipped text.
    """
    loc_intro = content.find("Introduction")
    if loc_intro == -1:
        print("Warning: Paper Doesn't Have an Introduction Title, these may lead to overlap of summarization")
        return content

    loc_refer = content.rfind("Reference")
    # Cut at "Reference" only when it comes after the introduction. If the
    # last "Reference" sits before it, slicing [loc_intro:loc_refer] would
    # yield an empty string and the whole paper would be lost. The same
    # comparison also covers the not-found case (-1).
    if loc_refer > loc_intro:
        return content[loc_intro:loc_refer]

    print("Warning: Paper Doesn't have a References Title, may lead to overlap of references in summary")
    return content[loc_intro:]
def chunk(content):
    """Clip the paper text and group its sentences into blocks of ten.

    The text is first passed through ``clip``, then split on ``". "``.
    Every resulting piece gets ``". "`` appended again (including the last
    one), and pieces are concatenated ten at a time. The final block holds
    whatever is left over when the count is not a multiple of ten.

    Returns:
        A list of strings, one per block, in document order.
    """
    print("-----Clipping content between Intro and References--------")
    pieces = clip(content).split(". ")
    group_size = 10
    return [
        "".join(piece + ". " for piece in pieces[start:start + group_size])
        for start in range(0, len(pieces), group_size)
    ]
def summarize(sent):
    """Summarize each text block and return the summaries, one per line.

    A ``"summarization"`` pipeline is built from the
    ``Falconsai/text_summarization`` checkpoint (used as both model and
    tokenizer) on every call. Each element of ``sent`` is summarized with
    ``max_length=256``, ``min_length=64`` and ``do_sample=False``; the
    ``summary_text`` of the first result is kept.

    Args:
        sent: Iterable of text blocks to summarize.

    Returns:
        All summaries concatenated, each followed by a newline.
    """
    checkpoint = "Falconsai/text_summarization"
    summarizer = pipeline("summarization", model=checkpoint, tokenizer=checkpoint)
    lines = []
    for passage in sent:
        result = summarizer(passage, max_length=256, min_length=64, do_sample=False)
        lines.append(result[0]['summary_text'] + "\n")
    return "".join(lines)
def doc_load(search_query="default", n_docs=1):
    """Search arXiv and return the matching documents with a title index.

    Args:
        search_query: Query passed to ``ArxivLoader``. The sentinel value
            ``"default"`` skips the search and returns placeholder lists.
        n_docs: Upper bound on documents to load (``load_max_docs``).

    Returns:
        A ``(titles, docs, n_pairs)`` triple where ``titles`` is the list of
        document titles, ``docs`` the loaded documents, and ``n_pairs`` a dict
        mapping each title to its index in ``docs``. When the search fails,
        the error is printed and ``([], [], {})`` is returned so callers can
        always unpack three values.
    """
    if search_query == "default":
        return [" ", " "], [" ", " "], [" ", " "]
    try:
        print("-------searching Paper----------")
        docs = ArxivLoader(query=search_query, load_max_docs=n_docs).load()
        titles = []
        n_pairs = {}
        # Iterate over what was actually returned: a search can yield fewer
        # documents than n_docs, and indexing range(n_docs) would then raise
        # IndexError and discard the results that were found.
        for i, doc in enumerate(docs):
            title = doc.metadata['Title']
            titles.append(title)
            n_pairs[title] = i
        return titles, docs, n_pairs
    except Exception as e:
        # Boundary with the network/loader: report the failure and hand back
        # an empty result instead of falling through and returning None.
        print("--------ERROR while Trying to Search Paper-------------")
        print(e)
        return [], [], {}
def run(choice, docs, n_pairs):
    """Summarize the selected paper and format the result for display.

    ``choice`` is looked up in ``n_pairs`` to find the document's index in
    ``docs``. The document text is flattened with ``strip``, split with
    ``chunk`` and summarized with ``summarize``; progress is reported through
    ``st.text`` and ``print`` along the way.

    Returns:
        A string containing the ``Published``, ``Title`` and ``Authors``
        metadata fields followed by the generated summary.
    """
    index = n_pairs[choice]
    st.text("Fetching Metadata")
    print("-----fetching metadata-------------")
    document = docs[index]
    metadata = document.metadata
    raw_text = document.page_content

    print("----stripping new lines----------")
    flat_text = strip(raw_text)

    print("-----------chunking content--------------")
    blocks = chunk(flat_text)
    st.text("Chunking Text....")
    st.text("🤔 Shortening text...")

    print("----summarizing content---------")
    summarized = summarize(blocks)

    # join() keeps the original concatenation semantics: every field except
    # the date must already be a string.
    return "".join([
        "Date: ", str(metadata['Published']), "\n",
        "\n Title: ", metadata['Title'], "\n",
        "\n Authors: ", metadata['Authors'], "\n",
        "\n Summary: \n", summarized,
    ])
st.title("ArXiV Summarizer")

# Streamlit re-executes this script from the top on every interaction, and a
# form's submit button is only True during the run triggered by its own click.
# Search results are therefore stored in st.session_state so they are still
# available on the later run in which the second form is submitted.
if "titles" not in st.session_state:
    st.session_state["titles"] = []
if "docs" not in st.session_state:
    st.session_state["docs"] = []
if "n_pairs" not in st.session_state:
    st.session_state["n_pairs"] = {}

with st.form(key="search_form"):
    col1, col2 = st.columns(2)
    with col1:
        search_query = st.text_input("Search Using Paper ID or Name*")
    with col2:
        n_docs = st.selectbox(label="Number of Documents to Load", options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    submit = st.form_submit_button(label="Search")

if submit:
    st.write("Fetching Papers 🤔 ")
    # Start from an empty result so a failed search also clears stale papers.
    titles, docs, n_pairs = [], [], {}
    try:
        titles, docs, n_pairs = doc_load(search_query=search_query, n_docs=n_docs)
    except Exception as e:
        # Top-level boundary: if doc_load raises or does not return three
        # values, report it and continue with the empty result.
        print(e)
    st.session_state["titles"] = titles
    st.session_state["docs"] = docs
    st.session_state["n_pairs"] = n_pairs
    if titles:
        st.write("Papers Fetched 🤩 ")
    else:
        st.write("Error while Fetching Papers 😥 Please Check ID or Name")

titles = st.session_state["titles"]
docs = st.session_state["docs"]
n_pairs = st.session_state["n_pairs"]

label = "Papers for " + search_query
with st.form(key="paper_form"):
    paper_name = st.selectbox(label=label, options=titles)
    submit_paper = st.form_submit_button(label="Fetch Paper & Summarize")

if submit_paper:
    # The selectbox yields no usable title when nothing has been fetched yet.
    if paper_name in n_pairs:
        st.text("Reading Document.... 📄 ")
        output = run(paper_name, docs, n_pairs)
        st.text_area(label="Summary", value=output, height=650)
    else:
        st.write("Error while Fetching Papers 😥 Please Check ID or Name")

st.text('''* - Please Use Paper ID (Example : 2301.10172) as it will give accurate results.
Free text search can give errors sometimes''')
st.text("While using Paper ID no need to change Number of Documents to load")