File size: 4,043 Bytes
d8f23da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import streamlit as st
from langchain.document_loaders import ArxivLoader 
from transformers import pipeline



def strip(content):
  """Return ``content`` as a string with every newline replaced by a space.

  ``content`` is converted with ``str()`` first, so non-string inputs are
  accepted.
  """
  text = str(content)
  # A single replace yields the same result as splitting on "\n" and
  # re-joining the pieces with " ".
  return text.replace("\n", " ")

def clip(content):
  """Trim paper text to the span between "Introduction" and "Reference".

  The result starts at the first occurrence of "Introduction" and stops just
  before the last occurrence of "Reference", provided that occurrence comes
  after the introduction. When no usable "Reference" marker exists, the text
  from "Introduction" to the end is returned. When "Introduction" is missing,
  ``content`` is returned unchanged. A warning is printed in both fallback
  cases.

  Args:
    content: Full text of the paper as a single string.

  Returns:
    The clipped text.
  """
  loc_intro = content.find("Introduction")
  if loc_intro == -1:
    print("Warning: Paper Doesn't Have an Introduction Title, these may lead to overlap of summarization")
    return content

  loc_refer = content.rfind("Reference")
  # Only a marker located after the introduction can close the span. A
  # "Reference" that appears solely before it (or not at all, i.e. -1) would
  # otherwise produce an empty slice and drop the whole paper body.
  if loc_refer > loc_intro:
    return content[loc_intro:loc_refer]

  print("Warning: Paper Doesn't have a References Title, may lead to overlap of references in summary")
  return content[loc_intro:]


def chunk(content, sentences_per_chunk=10):
  """Clip the paper text and group its sentences into fixed-size chunks.

  The text is first passed through ``clip`` and then split on ". ". Every
  piece gets ". " appended again (including the last one), and consecutive
  pieces are concatenated into chunks of ``sentences_per_chunk`` pieces. The
  final chunk may hold fewer pieces.

  Args:
    content: Paper text as a single string.
    sentences_per_chunk: Number of sentence pieces per chunk. Defaults to 10,
      the size that was previously hard-coded.

  Returns:
    A list of chunk strings. It always contains at least one entry, because
    splitting even an empty string yields one piece.

  Raises:
    ValueError: If ``sentences_per_chunk`` is smaller than 1.
  """
  if sentences_per_chunk < 1:
    raise ValueError("sentences_per_chunk must be at least 1")

  print("-----Clipping content between Intro and References--------")

  content = clip(content)

  pieces = content.split(". ")
  sent = []
  # Step through the pieces one batch at a time; joining each batch avoids
  # rebuilding the chunk string on every sentence.
  for start in range(0, len(pieces), sentences_per_chunk):
    batch = pieces[start:start + sentences_per_chunk]
    sent.append("".join(piece + ". " for piece in batch))

  return sent


def summarize(sent):
  """Summarize each text chunk and return the summaries, one per line.

  A "summarization" pipeline for the "Falconsai/text_summarization"
  checkpoint is created on every call and applied to the chunks in order.

  Args:
    sent: Iterable of text chunks to summarize.

  Returns:
    A single string in which every summary is followed by a newline; an
    empty string when ``sent`` yields nothing.
  """
  # Model and tokenizer are loaded from the same checkpoint name.
  checkpoint = "Falconsai/text_summarization"
  summarizer = pipeline("summarization", model=checkpoint, tokenizer=checkpoint)

  lines = []
  for passage in sent:
    result = summarizer(passage, max_length=256, min_length=64, do_sample=False)
    lines.append(result[0]['summary_text'] + "\n")

  return "".join(lines)

def doc_load(search_query="default", n_docs=1):
  """Search arXiv and return the matching papers with a title index.

  Args:
    search_query: Query passed to ``ArxivLoader``. The sentinel value
      "default" skips the search and returns placeholder lists.
    n_docs: Maximum number of documents to load.

  Returns:
    A tuple ``(titles, docs, n_pairs)``:
      titles: the 'Title' metadata entry of each loaded document, in order.
      docs: the documents returned by the loader.
      n_pairs: dict mapping each title to its index in ``docs``; when two
        documents share a title, the later index wins.
    For the "default" sentinel, three placeholder lists are returned. If the
    search fails, the error is printed and ``([], [], {})`` is returned so
    that callers can always unpack three values.
  """
  if search_query == "default":
    return [" ",  " "], [" ",  " "], [" ",  " "]
  try:
    print("-------searching Paper----------")
    docs = ArxivLoader(query=search_query, load_max_docs=n_docs).load()
    titles = []
    n_pairs = {}
    # Walk the documents that actually came back: the loader may return
    # fewer than n_docs, and indexing up to n_docs would then raise
    # IndexError and throw away the valid results.
    for index, doc in enumerate(docs[:n_docs]):
      title = doc.metadata['Title']
      titles.append(title)
      n_pairs[title] = index
    return titles, docs, n_pairs
  except Exception as e:
    print("--------ERROR while Trying to Search Paper-------------")
    print(e)
    # Return an explicit empty result instead of falling through to None.
    return [], [], {}
    

def run(choice, docs, n_pairs):
  """Summarize the selected paper and return a formatted report.

  Args:
    choice: Title of the selected paper; must be a key of ``n_pairs``.
    docs: Sequence of loaded documents exposing ``metadata`` and
      ``page_content``.
    n_pairs: Mapping from paper title to its index in ``docs``.

  Returns:
    A string with the publication date, title, authors and the generated
    summary of the paper.
  """
  index = n_pairs[choice]
  st.text("Fetching Metadata")
  print("-----fetching metadata-------------")
  paper = docs[index]
  metadata = paper.metadata
  content = paper.page_content

  # Flatten the text, split it into chunks, then summarize chunk by chunk.
  print("----stripping new lines----------")
  flattened = strip(content)
  print("-----------chunking content--------------")
  chunks = chunk(flattened)
  st.text("Chunking Text....")
  st.text("🤔 Shortening text...")
  print("----summarizing content---------")
  summarized = summarize(chunks)

  report_parts = [
    "Date: ",
    str(metadata['Published']),
    "\n\n Title: ",
    metadata['Title'],
    "\n\n Authors: ",
    metadata['Authors'],
    "\n\n Summary: \n",
    summarized,
  ]
  return "".join(report_parts)

# ---- Streamlit page: search form, paper picker and summary output ----
st.title("ArXiV Summarizer")

# Defaults keep the rest of the script usable when nothing has been fetched
# (blank query or failed search) instead of leaving docs/n_pairs undefined.
titles, docs, n_pairs = [], [], {}

with st.form(key="search_form"):
  col1, col2 = st.columns(2)
  with col1:
    search_query = st.text_input("Search Using Paper ID or Name*")
  with col2:
    n_docs = st.selectbox(label="Number of Documents to Load", options=(1, 2, 3, 4, 5, 6, 7, 8, 9 ,10))
  submit = st.form_submit_button(label="Search")

if submit:
  st.write("Fetching Papers 🤔 ")

# The fetch is intentionally not tied to `submit`: Streamlit reruns the whole
# script on every interaction, so the titles must be rebuilt when the second
# form is submitted. A blank query is skipped so that no error banner appears
# before the user has searched for anything.
if search_query.strip():
  try:
    titles, docs, n_pairs = doc_load(search_query=search_query, n_docs=n_docs)
  except Exception as e:
    # Unpacking fails if doc_load hands back something other than a triple;
    # the empty defaults above then stay in place.
    print(e)
  if titles:
    st.write("Papers Fetched 🤩 ")
  else:
    st.write("Error while Fetching Papers 😥 Please Check ID or Name")

label = "Papers for " + search_query
with st.form(key="paper_form"):
  paper_name = st.selectbox(label=label, options=titles)
  submit_paper = st.form_submit_button(label="Fetch Paper & Summarize")

if submit_paper:
  # Only summarize when the selected title maps to a loaded document;
  # otherwise run() would fail on a missing key or an undefined index.
  if isinstance(n_pairs, dict) and paper_name in n_pairs:
    st.text("Reading Document.... 📄 ")
    output = run(paper_name, docs, n_pairs)
    st.text_area(label = "Summary", value=output, height = 650)
  else:
    st.write("Error while Fetching Papers 😥 Please Check ID or Name")

st.text('''* - Please Use Paper ID (Example : 2301.10172) as it will give accurate results. 
          Free text search can give errors sometimes''')
st.text("While using Paper ID no need to change Number of Documents to load")