# File-viewer header (not part of the program): eHemink - "Update app.py" - commit 6d660d9 - 3.04 kB
# -*- coding: utf-8 -*-
"""app.py.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V
"""
# Imports
!pip install PyPDF2
import PyPDF2
import re
!pip install transformers
import transformers
from transformers import pipeline
!pip install git+https://github.com/suno-ai/bark.git
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav
from IPython.display import Audio
!pip install gradio
import gradio as gr
# Code
# Hugging Face model used to condense the abstract into a short summary.
_SUMMARIZER_MODEL = "lidiya/bart-large-xsum-samsum"

# The "Abstract" heading is matched case-insensitively. The closing
# "Introduction" heading is matched in title case or all caps only, so a
# lowercase "introduction" inside the abstract's own prose does not cut the
# abstract short.
_ABSTRACT_HEADING = re.compile(r'\bAbstract\b', re.IGNORECASE)
_INTRO_HEADING = re.compile(r'\b(?:Introduction|INTRODUCTION)\b')

# Summarization pipeline, built on first use and then shared by every request
# so the model is not reloaded for each uploaded PDF.
_summarizer = None


def _extract_abstract(pdf_path):
    """Return the abstract text of the PDF at *pdf_path*, or '' if none is found.

    Pages are scanned in order. On the first page containing the word
    "Abstract", everything after that word up to the next "Introduction"
    heading (or to the end of that page when there is no such heading) is
    returned with surrounding whitespace stripped.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against pages that yield no text at all.
            text = page.extract_text() or ''
            abstract_match = _ABSTRACT_HEADING.search(text)
            if abstract_match is None:
                continue
            start_index = abstract_match.end()
            next_section_match = _INTRO_HEADING.search(text, start_index)
            if next_section_match is None:
                # No following section on this page: take the rest of the page.
                return text[start_index:].strip()
            return text[start_index:next_section_match.start()].strip()
    return ''


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _summarizer
    if _summarizer is None:
        _summarizer = pipeline(task="summarization", model=_SUMMARIZER_MODEL)
    return _summarizer


def abstract_to_audio(insert_pdf):
    """Summarize the abstract of an uploaded PDF and speak the summary.

    Args:
        insert_pdf: The uploaded PDF, either as a filesystem path or as a
            file-like object exposing the path in its ``name`` attribute
            (both forms are accepted because the value handed over by the
            Gradio ``'file'`` input differs between Gradio versions).

    Returns:
        A ``(SAMPLE_RATE, audio_array)`` tuple, the form a Gradio ``'audio'``
        output accepts, where ``audio_array`` is the value returned by Bark's
        ``generate_audio`` for the summary sentence.

    Raises:
        gr.Error: If no file was provided or no abstract could be found.
    """
    if insert_pdf is None:
        raise gr.Error("Please upload a PDF file.")
    pdf_path = getattr(insert_pdf, 'name', insert_pdf)

    abstract = _extract_abstract(pdf_path)
    if not abstract:
        # Summarizing an empty string would only fail deeper in the pipeline.
        raise gr.Error("No abstract section was found in the PDF.")

    # truncation=True keeps over-long extractions (e.g. when no "Introduction"
    # heading bounds the abstract) within the model's input limit.
    summarized = _get_summarizer()(abstract, truncation=True)
    tss_prompt = summarized[0]['summary_text']
    print(tss_prompt)

    # Download and load all Bark models, then synthesize the summary.
    preload_models()
    audio_array = generate_audio(tss_prompt)

    # Return raw samples with their rate instead of an IPython display
    # object, which only renders inside a notebook.
    return SAMPLE_RATE, audio_array
# Wire the summarizer into a Gradio UI: one file upload in, one audio
# player out.
my_app = gr.Interface(
    fn=abstract_to_audio,
    inputs='file',
    outputs='audio',
    title="PDF Abstract Summarizer",
    description="Extracts abstracts from PDFs and generates audio summaries.",
)

# Start the web app; share=True additionally requests a public share link.
my_app.launch(share=True)