# -*- coding: utf-8 -*-
"""app.py.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V

Gradio app that extracts the abstract from an uploaded PDF article,
summarizes it with a BART model and speaks the summary using Bark.
"""

# Dependencies. The notebook installed these with IPython shell magics
# (``!pip install ...``), which are a SyntaxError in a plain ``.py`` file.
# Install them beforehand, or run these commands in a notebook cell:
#   pip install PyPDF2
#   pip install transformers
#   pip install git+https://github.com/suno-ai/bark.git
#   pip install gradio

# Imports
import functools
import os
import re

import PyPDF2
import transformers
from transformers import pipeline
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav
from IPython.display import Audio
import gradio as gr

# Code

# Hugging Face model used to condense the abstract into a short summary.
SUMMARIZATION_MODEL = "lidiya/bart-large-xsum-samsum"

# Compiled once; the "Abstract" heading is matched case-insensitively while
# the "Introduction" heading is matched exactly as written here.
_ABSTRACT_HEADING = re.compile(r'\bAbstract\b', re.IGNORECASE)
_INTRODUCTION_HEADING = re.compile(r'\bIntroduction\b')


def _abstract_from_page_text(text: str):
    """Return the abstract found in one page's text, or None if absent.

    The abstract is everything after the first "Abstract" heading up to the
    next "Introduction" heading, or up to the end of the page text when no
    such heading follows. Surrounding whitespace is stripped.
    """
    heading = _ABSTRACT_HEADING.search(text)
    if heading is None:
        return None
    remainder = text[heading.end():]
    next_section = _INTRODUCTION_HEADING.search(remainder)
    if next_section:
        remainder = remainder[:next_section.start()]
    return remainder.strip()


def _extract_abstract(pdf_path) -> str:
    """Return the abstract of the PDF at *pdf_path*, or '' if none is found.

    Pages are scanned in order and the search stops at the first page that
    contains an "Abstract" heading.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Guard against a page that yields no text at all.
            abstract = _abstract_from_page_text(page.extract_text() or '')
            if abstract is not None:
                return abstract
    return ''


def _resolve_upload_path(upload) -> str:
    """Return a filesystem path for whatever the Gradio file input passed.

    Depending on the Gradio version the 'file' input delivers either a path
    (str / PathLike) or a temp-file wrapper exposing the path as ``.name``.
    """
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


@functools.lru_cache(maxsize=1)
def _get_summarizer():
    """Build the summarization pipeline once and reuse it across requests."""
    return pipeline(task="summarization", model=SUMMARIZATION_MODEL)


@functools.lru_cache(maxsize=1)
def _ensure_bark_models():
    """Download and load the Bark models once instead of on every request."""
    preload_models()


def abstract_to_audio(insert_pdf):
    """Summarize a PDF's abstract and synthesize the summary as speech.

    Args:
        insert_pdf: The uploaded PDF, as a file path or as a temp-file
            object with a ``.name`` attribute (as supplied by Gradio).

    Returns:
        A ``(sample_rate, audio_array)`` tuple, the format expected by the
        Gradio 'audio' output component.

    Raises:
        gr.Error: If no file was uploaded or no abstract could be found.
    """
    if insert_pdf is None:
        raise gr.Error("Please upload a PDF file.")

    # Extracting the abstract text from the article pdf
    abstract = _extract_abstract(_resolve_upload_path(insert_pdf))
    if not abstract:
        raise gr.Error('No "Abstract" section could be found in the PDF.')

    # Summarizing the extracted abstract; truncation keeps over-long input
    # (e.g. when no "Introduction" heading bounded the abstract) within the
    # model's maximum input length.
    summarized = _get_summarizer()(abstract, truncation=True)
    tts_prompt = summarized[0]['summary_text']
    print(tts_prompt)

    # Generate audio that speaks the generated sentence using Bark
    _ensure_bark_models()
    audio_array = generate_audio(tts_prompt)

    # Gradio's audio output takes (sample rate, samples), not an IPython
    # Audio widget.
    return SAMPLE_RATE, audio_array


my_app = gr.Interface(
    fn=abstract_to_audio,
    inputs='file',
    outputs='audio',
    title="PDF Abstract Summarizer",
    description="Extracts abstracts from PDFs and generates audio summaries.",
)

# Still true in a notebook or when run as a script, but importing this module
# no longer starts a public server as a side effect.
if __name__ == "__main__":
    my_app.launch(share=True)