Spaces:
Build error
Build error
initial commit
Browse files- app.py +105 -0
- requirements.txt +7 -0
app.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
# import langchain
|
| 3 |
+
import PyPDF2
|
| 4 |
+
import os
|
| 5 |
+
from transformers import BartTokenizer , BartForConditionalGeneration
|
| 6 |
+
|
| 7 |
+
# Load the pretrained BART-large-CNN summarization model and its tokenizer
# once at import time so every request reuses the same weights.
# NOTE(review): this downloads the checkpoint on first run — confirm the
# deployment environment has network access / a model cache.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def save_uploaded_file(uploaded_file):
    """Persist a Streamlit upload to a local temp directory.

    Args:
        uploaded_file: object exposing ``.name`` and ``.getbuffer()``
            (the Streamlit ``UploadedFile`` interface).

    Returns:
        str: path of the file written under ``temp_files/``.
    """
    temp_dir = "temp_files"
    os.makedirs(temp_dir, exist_ok=True)
    # BUG FIX: use only the basename of the client-supplied filename so a
    # crafted name like "../../x" cannot escape the temp directory
    # (path-traversal hardening for untrusted upload metadata).
    file_path = os.path.join(temp_dir, os.path.basename(uploaded_file.name))
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path
| 18 |
+
|
| 19 |
+
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Save the uploaded PDF to disk and return the concatenated text of
    all its pages.

    Args:
        pdf_file: Streamlit ``UploadedFile`` for the PDF.

    Returns:
        str: extracted text; image-only pages contribute "".
    """
    saved_path = save_uploaded_file(pdf_file)
    parts = []
    with open(saved_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Iterate pages directly instead of indexing with range(len(...)).
        for page in pdf_reader.pages:
            # BUG FIX: extract_text() can return None (e.g. scanned /
            # image-only pages); "or ''" avoids a TypeError on concatenation.
            parts.append(page.extract_text() or "")
    # join() builds the result in one pass instead of quadratic "+=".
    return "".join(parts)
| 30 |
+
|
| 31 |
+
def generate_summary(text: str):
    """Run BART beam-search generation over *text*.

    Args:
        text: raw document text to summarize.

    Returns:
        torch.Tensor: generated summary token ids, shape (1, seq_len).
    """
    # Tokenize; BART's encoder takes at most 1024 tokens, so truncate.
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    # IMPROVEMENT: pass the attention mask explicitly so non-content
    # positions are ignored during generation — transformers warns when it
    # is omitted and output can degrade if pad tokens are present.
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        num_beams=4,
        max_length=200,
        early_stopping=True,
    )
    return summary_ids
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Function to summarize text
def summarize_text(text: str) -> str:
    """Summarize *text* with the module-level BART model.

    Args:
        text: raw document text.

    Returns:
        str: decoded summary.
    """
    summary_ids = generate_summary(text)
    # BUG FIX: the keyword was misspelled "clean_ip_tokenization_spaces",
    # so it was silently swallowed by **kwargs and had no effect; the
    # correct transformers keyword is "clean_up_tokenization_spaces".
    summary = tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return summary
|
| 47 |
+
|
| 48 |
+
# Function to extract key information from the paper
def extract_paper_info(text):
    """Placeholder for pulling key metadata (title, authors, sections)
    out of the paper text; not implemented yet."""
    # Intended future work: regex / NLP-based extraction of key fields.
    return None
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Function to build and fine-tune the chatbot
def build_chatbot():
    """Placeholder chatbot builder.

    Returns:
        str: currently an empty string standing in for the future
        Langchain-backed, fine-tuned language model.
    """
    chatbot_model = ''
    # Additional fine-tuning steps can be added here.
    return chatbot_model
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Main function to run the Streamlit app
def main():
    """Entry point for the Streamlit UI: upload a PDF, show its summary."""
    st.title("Research Paper Understanding Chatbot")
    st.write("As of now supports only summarization.")

    # Upload widget for the research paper.
    uploaded_file = st.file_uploader("Upload a research paper (PDF)", type="pdf")

    # Guard clause: nothing to do until a file arrives.
    if uploaded_file is None:
        st.write("Please upload a PDF file")
        return

    st.write("Paper uploaded successfully!")

    # Pull the raw text out of the PDF, then summarize it.
    text = extract_text_from_pdf(uploaded_file)

    st.subheader("Summary of the Paper")
    with st.spinner("Brewing a potion for your paper's essence..."):
        summary = summarize_text(text)
    st.write(summary)

    # TODO: key-information extraction (extract_paper_info) and a chat
    # interface (build_chatbot) were sketched here but are not wired up yet.
| 103 |
+
|
| 104 |
+
# Standard script guard: launch the Streamlit app when run directly
# (streamlit itself imports this module, so side effects stay minimal).
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
transformers
|
| 3 |
+
langchain
|
| 4 |
+
PyPDF2
|
| 5 |
+
tensorflow
|
| 6 |
+
tf-keras
|
| 7 |
+
torch
|