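"""Legal document summarizer.

A Gradio app that builds an extractive summary of a legal document supplied
as raw text, PDF, or DOCX, using LegalBERT sentence embeddings, and returns
the summary as text, as a downloadable PDF, and as spoken audio (gTTS).
"""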
import gradio as gr
import os
import torch
from transformers import AutoTokenizer, AutoModel
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
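
# Third-party dependencies (PyPI package names): gradio, torch, transformers,
# fpdf2 (or the legacy PyFPDF), gTTS, pdfminer.six, python-docx, reportlab, spacy.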
# Import spaCy and handle model loading
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Download the model if it is not found
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# Load the LegalBERT model and tokenizer with use_fast=False
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased", use_fast=False)
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
# Convert DOCX to PDF using ReportLab
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    """Render the paragraphs of a DOCX file onto a simple PDF."""
    doc = Document(docx_file)
    full_text = [para.text for para in doc.paragraphs]
    pdf = canvas.Canvas(output_pdf, pagesize=letter)
    pdf.setFont("Helvetica", 12)
    text_object = pdf.beginText(40, 750)
    for line in full_text:
        # Start a new page when the cursor nears the bottom margin,
        # so long documents are not drawn off the page
        if text_object.getY() <= 40:
            pdf.drawText(text_object)
            pdf.showPage()
            text_object = pdf.beginText(40, 750)
            text_object.setFont("Helvetica", 12)
        text_object.textLine(line)
    pdf.drawText(text_object)
    pdf.save()
    return output_pdf
# Extractive summarization using LegalBERT and spaCy
def extractive_summarization(text, num_sentences=5):
    # Split the text into sentences using spaCy
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if not sentences:
        return ""
    # Handle the case where the document has fewer sentences than requested
    num_sentences = min(num_sentences, len(sentences))
    # Encode every sentence with LegalBERT
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Get sentence embeddings by averaging token embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1)
    # Score each sentence by its cosine similarity to the document embedding
    document_embedding = embeddings.mean(dim=0, keepdim=True)
    similarities = torch.nn.functional.cosine_similarity(embeddings, document_embedding)
    # Select the top sentences and restore their original order
    top_k = torch.topk(similarities, k=num_sentences)
    selected_indices = top_k.indices.sort().values
    summary_sentences = [sentences[idx] for idx in selected_indices]
    # Combine the selected sentences into the summary
    summary = ' '.join(summary_sentences)
    return summary
# Process the input: raw text, an uploaded PDF, or an uploaded DOCX
def pdf_to_text(text, PDF, num_sentences=5):
    try:
        if PDF is not None:
            file_extension = os.path.splitext(PDF.name)[1].lower()
            if file_extension == '.docx':
                pdf_file_path = docx_to_pdf(PDF.name)
                text = extract_text(pdf_file_path)
            elif file_extension == '.pdf':
                text = extract_text(PDF.name)
            else:
                return None, "Unsupported file type", None
        elif text != "":
            pass  # Use the text input provided by the user
        else:
            return None, "Please provide input text or upload a file.", None
        summary = extractive_summarization(text, num_sentences)
        # Generate a PDF of the summary; FPDF's core fonts only support
        # Latin-1, so replace any characters outside that range
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=summary.encode('latin-1', 'replace').decode('latin-1'), align='L')
        pdf_output_path = "legal_summary.pdf"
        pdf.output(pdf_output_path)
        # Generate an audio file of the summary (gTTS writes MP3 data)
        audio_output_path = "legal_summary.mp3"
        tts = gTTS(text=summary, lang='en', slow=False)
        tts.save(audio_output_path)
        return audio_output_path, summary, pdf_output_path
    except Exception as e:
        return None, f"An error occurred: {str(e)}", None
# Preloaded document handler
def process_sample_document(num_sentences=5):
    sample_document_path = "Marbury v. Madison.pdf"
    with open(sample_document_path, "rb") as f:
        return pdf_to_text("", f, num_sentences)
# Gradio interface
with gr.Blocks() as iface:
    with gr.Row():
        process_sample_button = gr.Button("Summarize Marbury v. Madison Case (Pre-Uploaded)")
    text_input = gr.Textbox(label="Input Text")
    file_input = gr.File(label="Upload PDF or DOCX")
    slider = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of Summary Sentences")
    audio_output = gr.Audio(label="Generated Audio")
    summary_output = gr.Textbox(label="Generated Summary")
    pdf_output = gr.File(label="Summary PDF")

    # Summarize the pre-uploaded sample case when the button is clicked
    process_sample_button.click(
        fn=process_sample_document,
        inputs=slider,
        outputs=[audio_output, summary_output, pdf_output]
    )

    # Summarize on text submission or file upload
    def on_submit(text, file, num_sentences):
        return pdf_to_text(text, file, num_sentences)

    text_input.submit(
        fn=on_submit,
        inputs=[text_input, file_input, slider],
        outputs=[audio_output, summary_output, pdf_output]
    )
    file_input.change(
        fn=on_submit,
        inputs=[text_input, file_input, slider],
        outputs=[audio_output, summary_output, pdf_output]
    )

if __name__ == "__main__":
    iface.launch()