File size: 3,020 Bytes
72abb26
 
 
 
 
 
 
 
 
3ba7fb6
 
0613b6c
72abb26
 
 
be2266d
72abb26
f8ca0d6
b54ebb1
369bf43
72abb26
3ba7fb6
72abb26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ca0d6
72abb26
 
f8ca0d6
 
be2266d
9cbc782
72abb26
 
b54ebb1
 
 
05a9ffe
cf97f39
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
"""app.py.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V
"""

# https://huggingface.co/spaces/eHemink/assessment3_part2

# Here are imports
import PyPDF2
import re
import transformers
import scipy
from transformers import pipeline
from bark import SAMPLE_RATE, generate_audio, preload_models
import gradio as gr
import os

# Here is the code
# Hugging Face model used to condense the extracted abstract.
_SUMMARIZER_MODEL = "lidiya/bart-large-xsum-samsum"

# Lazily-created summarization pipeline, shared across calls so the model is
# loaded once per process instead of once per request.
_summarizer = None


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _summarizer
    if _summarizer is None:
        _summarizer = pipeline(task="summarization", model=_SUMMARIZER_MODEL)
    return _summarizer


def _extract_abstract(pdf_file):
    """Return the abstract text found in the PDF at ``pdf_file``.

    Pages are scanned in order. On the first page containing the word
    "Abstract" (case-insensitive), the text following that word is taken up
    to the next "Introduction" (case-sensitive) on the same page, or to the
    end of the page if there is none. Surrounding whitespace is stripped.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The abstract text, or an empty string if no page mentions "Abstract".
    """
    # Open the PDF file in read-binary mode
    with open(pdf_file, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for page in pdf_reader.pages:
            # Fall back to '' so a page without extractable text cannot
            # break the regex search below.
            text = page.extract_text() or ''

            abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
            if not abstract_match:
                continue

            # The abstract runs from just after the heading to the next
            # section, indicated by the "Introduction" heading.
            start_index = abstract_match.end()
            next_section_match = re.search(r'\bIntroduction\b', text[start_index:])
            if next_section_match:
                end_index = start_index + next_section_match.start()
                return text[start_index:end_index].strip()

            # No next section on this page: take everything to the end.
            return text[start_index:].strip()

    return ''


def abstract_to_audio(insert_pdf):
    """Summarize a PDF's abstract and synthesize the summary as speech.

    Args:
        insert_pdf: The uploaded PDF, either as a filesystem path or as a
            file-like object exposing the path through a ``name`` attribute.
            NOTE(review): which of the two Gradio passes for ``inputs='file'``
            presumably depends on the installed Gradio version - both are
            accepted here.

    Returns:
        A ``(SAMPLE_RATE, audio_array)`` tuple, where ``audio_array`` is the
        value returned by Bark's ``generate_audio`` for the summary text.

    Raises:
        gr.Error: If no file was provided or no abstract could be extracted.
    """
    if insert_pdf is None:
        raise gr.Error("Please upload a PDF file.")

    # Plain paths are used as-is; anything else is expected to carry its
    # path in ``.name`` (checked second because path objects also have a
    # ``name`` attribute that holds only the basename).
    if isinstance(insert_pdf, (str, bytes, os.PathLike)):
        pdf_path = insert_pdf
    else:
        pdf_path = getattr(insert_pdf, "name", insert_pdf)

    abstract = _extract_abstract(pdf_path)
    if not abstract:
        # Summarizing an empty string would not yield a meaningful result,
        # so report the problem instead.
        raise gr.Error("No abstract was found in the uploaded PDF.")

    # Summarizing the extracted abstract. Truncation keeps over-long text
    # (e.g. when no "Introduction" heading bounded the abstract) within the
    # model's maximum input length.
    summarized = _get_summarizer()(abstract, truncation=True)
    tss_prompt = summarized[0]['summary_text']
    print(tss_prompt)

    # Generate audio that speaks the summary using Bark.
    # NOTE(review): presumably a cheap no-op after the first call if Bark
    # caches its loaded models - confirm.
    preload_models()

    audio_array = generate_audio(tss_prompt)
    return (SAMPLE_RATE, audio_array)






# Sample paper offered as a clickable example; located next to this script.
_example_pdf = os.path.join(
    os.path.dirname(__file__),
    "Hidden_Technical_Debt_in_MLSystems.pdf",
)

# Gradio UI: a file upload feeds abstract_to_audio, whose result is shown
# through an audio output component.
my_app = gr.Interface(
    fn=abstract_to_audio,
    inputs='file',
    outputs='audio',
    title="PDF Abstract Summarizer",
    description="Extracts abstracts from PDFs and generates audio summaries. This app only accepts PDFs with abstracts.",
    examples=[_example_pdf],
    cache_examples=True,
)
my_app.launch()