lincolnlegal / app.py
Ari
Update app.py
7f2b3e5 verified
raw
history blame
1.97 kB
import gradio as gr
import os
import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
# Download the NLTK "punkt" resource when the module is imported.
nltk.download("punkt")

# Build the tokenizer and the seq2seq model a single time at import, so each
# request handled by the app reuses them instead of reloading the checkpoint.
_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
def _latin1_safe(text):
    """Return *text* with every character outside Latin-1 replaced by '?'."""
    return text.encode("latin-1", "replace").decode("latin-1")


# Main processing function
def pdf_to_text(text, PDF, min_length=20):
    """Summarize the given text (or an uploaded PDF) and export the summary.

    Args:
        text: Text to summarize. When it is empty, ``None`` or only
            whitespace, the text is extracted from ``PDF`` instead.
        PDF: Uploaded PDF, either an object exposing its path as ``.name``
            or a plain path string. May be ``None`` when ``text`` is given.
        min_length: Minimum summary length passed to ``model.generate``;
            coerced to ``int``. The maximum is ``min_length + 1000``.

    Returns:
        A tuple ``(audio_path, summary_text, pdf_path)``. On any failure the
        tuple is ``(None, "An error occurred: <reason>", None)`` so the UI
        can show the reason instead of crashing.
    """
    try:
        # Extract text from the PDF only when no usable text was provided.
        if not text or not text.strip():
            if PDF is None:
                raise ValueError("no text was entered and no PDF was uploaded")
            # Accept both a file-like upload (path in .name) and a bare path.
            pdf_path = getattr(PDF, "name", PDF)
            text = extract_text(pdf_path)
            if not text or not text.strip():
                raise ValueError("no text could be extracted from the PDF")

        # Tokenize text. truncation=True is what makes max_length effective;
        # without it long documents are passed to the model untruncated.
        inputs = tokenizer(
            [text], max_length=1024, truncation=True, return_tensors="pt"
        )
        min_length = int(min_length)

        # Generate summary
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=2,
            min_length=min_length,
            max_length=min_length + 1000,
        )
        output_text = tokenizer.batch_decode(
            summary_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )[0]

        # Save summarized text to PDF. The built-in "Times" font only covers
        # Latin-1, so other characters (curly quotes, dashes, ...) are
        # replaced to keep PDF generation from failing on them.
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=_latin1_safe(output_text), align='C')
        pdf_output_path = "legal.pdf"
        pdf.output(pdf_output_path)

        # Convert summarized text to audio. gTTS writes MP3 data, so the
        # file is named accordingly rather than mislabelled as WAV.
        audio_output_path = "legal.mp3"
        tts = gTTS(text=output_text, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, output_text, pdf_output_path
    except Exception as e:
        # UI boundary: report the failure in the text output slot.
        return None, f"An error occurred: {str(e)}", None
# Gradio interface: a text box, a PDF upload and a minimum-length slider feed
# pdf_to_text; its three results are shown as audio, text and a file.
# NOTE(review): gr.inputs.* is Gradio's legacy component namespace - confirm
# the Gradio version pinned for this app still provides it.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=[
        "text",
        gr.inputs.File(label="Upload PDF"),
        gr.inputs.Slider(
            minimum=10,
            maximum=100,
            step=10,
            default=20,
            label="Summary Minimum Length",
        ),
    ],
    outputs=["audio", "text", "file"],
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()