# Spaces: Runtime error (page-status text captured with this file; not part of the program)
# -*- coding: utf-8 -*-
"""app.py.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V
"""
# https://huggingface.co/spaces/eHemink/assessment3_part2
# Here are the imports
import functools
import re

import gradio as gr
import numpy as np
import PyPDF2
import scipy
import transformers
from transformers import pipeline

#from bark import SAMPLE_RATE, generate_audio, preload_models
#from scipy.io.wavfile import write as write_wav
#from IPython.display import Audio
# Here is the code | |
@functools.lru_cache(maxsize=None)
def _load_pipeline(task, model):
    """Build the transformers pipeline for ``(task, model)`` once and reuse it.

    Without the cache, both models would be loaded again on every request.
    """
    return pipeline(task=task, model=model)


def _extract_abstract(pdf_file):
    """Return the abstract text found in the PDF at ``pdf_file``.

    The pages are scanned in order for the word "Abstract" (case-insensitive).
    On the first page that contains it, the text after that word is returned,
    cut at the first "Introduction" (case-sensitive) on the same page, or
    running to the end of the page when no such heading follows.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The stripped abstract text, or ``''`` when no page mentions "Abstract".
    """
    with open(pdf_file, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against a page yielding no text at all, so re.search never
            # receives None.
            text = page.extract_text() or ''
            abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
            if not abstract_match:
                continue
            start_index = abstract_match.end()
            next_section_match = re.search(r'\bIntroduction\b', text[start_index:])
            if next_section_match:
                end_index = start_index + next_section_match.start()
                return text[start_index:end_index].strip()
            # No following heading on this page: take everything to its end.
            return text[start_index:].strip()
    return ''


def abstract_to_audio(insert_pdf):
    """Summarize a PDF's abstract and return the summary as speech.

    Args:
        insert_pdf: The uploaded PDF, either as a filesystem path or as an
            object exposing that path through a ``name`` attribute.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the text-to-speech
        pipeline output.

    Raises:
        ValueError: If no file was provided, or no abstract text could be
            extracted from the PDF.
    """
    if insert_pdf is None:
        raise ValueError("No PDF file was provided.")
    # NOTE(review): gradio's 'file' input may hand over a plain path string or
    # a temp-file object carrying the path in `.name`, depending on the gradio
    # version; accept both.
    pdf_path = getattr(insert_pdf, 'name', insert_pdf)

    abstract = _extract_abstract(pdf_path)
    if not abstract:
        raise ValueError("No abstract could be found in the uploaded PDF.")

    # Summarize the extracted abstract. truncation=True keeps over-long input
    # (e.g. a whole page when no "Introduction" heading followed) within the
    # model's input limit instead of failing.
    summarizer = _load_pipeline("summarization", "lidiya/bart-large-xsum-samsum")
    summarized = summarizer(abstract, truncation=True)
    tss_prompt = summarized[0]['summary_text']
    print(tss_prompt)

    # Speak the summary with Bark.
    # NOTE(review): the earlier call also passed pad_token_id=... as a
    # top-level keyword; the text-to-speech pipeline call is expected to take
    # generation options only via forward_params, so that keyword is dropped.
    # Confirm against the installed transformers version.
    tss_pipeline = _load_pipeline("text-to-speech", "suno/bark-small")
    speech = tss_pipeline(tss_prompt, forward_params={"do_sample": True})

    # NOTE(review): the waveform may come back with a leading batch axis of
    # size 1; drop it so the audio output receives a 1-D sample array.
    audio = np.asarray(speech["audio"])
    if audio.ndim == 2 and audio.shape[0] == 1:
        audio = audio[0]
    return (speech["sampling_rate"], audio)
# Wire the function into a Gradio interface: one uploaded file in, one audio
# clip out.
my_app = gr.Interface(
    fn=abstract_to_audio,
    inputs='file',
    outputs='audio',
    title="PDF Abstract Summarizer",
    description="Extracts abstracts from PDFs and generates audio summaries. This app only accepts PDFs with abstracts.",
)

# Start serving the interface.
my_app.launch()