File size: 3,315 Bytes
72abb26
 
 
 
 
 
 
 
 
3ba7fb6
 
 
72abb26
 
d4d88c0
72abb26
be2266d
72abb26
6bf30eb
 
 
b54ebb1
72abb26
3ba7fb6
72abb26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49e2b72
72abb26
 
49e2b72
 
be2266d
49e2b72
90b7044
49e2b72
72abb26
9cbc782
72abb26
 
b54ebb1
 
 
90b7044
 
9c0c4cf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
"""app.py.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V
"""

# https://huggingface.co/spaces/eHemink/assessment3_part2

# Here are the imports
import PyPDF2
import re
import numpy as np
import transformers
import scipy
from transformers import pipeline
#from bark import SAMPLE_RATE, generate_audio, preload_models
#from scipy.io.wavfile import write as write_wav
#from IPython.display import Audio
import gradio as gr

# Here is the code
# Heading that opens the abstract; matched case-insensitively.
_ABSTRACT_HEADING = re.compile(r'\bAbstract\b', re.IGNORECASE)
# Heading that closes the abstract. Only title-case and all-caps spellings are
# accepted so a lowercase "introduction" inside the abstract body does not
# truncate it.
_INTRODUCTION_HEADING = re.compile(r'\b(?:Introduction|INTRODUCTION)\b')

# Pipelines already built in this process, keyed by (task, model name).
_PIPELINE_CACHE = {}


def _resolve_upload_path(uploaded):
    """Return something ``open()`` accepts for the value the file input gave us."""
    if uploaded is None:
        raise ValueError(
            "No PDF was provided; please upload a PDF file that contains an abstract."
        )
    # Plain paths pass straight through (checked first because path objects
    # also have a ``.name`` attribute, which is only the base name).
    if isinstance(uploaded, (str, bytes)) or hasattr(uploaded, "__fspath__"):
        return uploaded
    # NOTE(review): some Gradio versions appear to pass a temp-file wrapper
    # whose ``.name`` is the path on disk -- confirm against the deployed version.
    return getattr(uploaded, "name", uploaded)


def _abstract_from_page(text):
    """Return the abstract found in one page's text, or None if there is no heading."""
    heading = _ABSTRACT_HEADING.search(text)
    if heading is None:
        return None
    remainder = text[heading.end():]
    closing = _INTRODUCTION_HEADING.search(remainder)
    if closing is not None:
        # Stop at the next section heading; otherwise keep the rest of the page.
        remainder = remainder[:closing.start()]
    return remainder.strip()


def _extract_abstract(pdf_path):
    """Return the abstract from the first page that has an "Abstract" heading, else ''."""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Treat a page that yields no text as empty instead of failing in re.
            text = page.extract_text() or ''
            abstract = _abstract_from_page(text)
            if abstract is not None:
                return abstract
    return ''


def _get_pipeline(task, model):
    """Build the requested pipeline once per process and reuse it afterwards."""
    key = (task, model)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task=task, model=model)
    return _PIPELINE_CACHE[key]


def abstract_to_audio(insert_pdf):
    """Summarize a PDF's abstract and synthesize the summary as speech.

    Args:
        insert_pdf: The uploaded PDF, either as a filesystem path or as an
            object whose ``name`` attribute is the path.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the text-to-speech
        pipeline output, suitable for the interface's audio output.

    Raises:
        ValueError: If no file was provided or no abstract text was found.
    """
    abstract = _extract_abstract(_resolve_upload_path(insert_pdf))
    if not abstract:
        # Fail loudly instead of feeding an empty string to the summarizer.
        raise ValueError("No abstract could be found in the uploaded PDF.")

    # Summarize the extracted abstract; truncation keeps over-long abstracts
    # within the model's maximum input length instead of erroring out.
    summarizer = _get_pipeline("summarization", "lidiya/bart-large-xsum-samsum")
    summarized = summarizer(abstract, truncation=True)
    tts_prompt = summarized[0]['summary_text']
    print(tts_prompt)

    # Speak the summary with Bark.
    synthesizer = _get_pipeline("text-to-speech", "suno/bark-small")
    speech = synthesizer(tts_prompt)
    # Defensive: drop any length-1 axis so the audio is a flat sample array
    # (no-op if it is already 1-D). Presumably some pipeline versions return
    # shape (1, n) -- TODO confirm.
    return (speech["sampling_rate"], np.squeeze(speech["audio"]))






# Build the Gradio UI: one uploaded file in, one audio clip out.
# NOTE(review): the example paths carry no ".pdf" extension -- confirm they
# match the files actually shipped under files/.
my_app = gr.Interface(
    fn=abstract_to_audio,
    inputs='file',
    outputs='audio',
    title="PDF Abstract Summarizer",
    description="Extracts abstracts from PDFs and generates audio summaries. This app only accepts PDFs with abstracts.",
    examples=[
        "files/Article 11 Hidden Technical Debt in Machine Learning Systems",
        # This literal was previously split across two lines, which is a
        # syntax error (unterminated string).
        "files/Article 6 BloombergGPT_ A Large Language Model for Finance",
    ],
)
my_app.launch()