lincolnlegalbart

Sleeping

App Files Files Community

arithescientist committed on Oct 11, 2024

Commit

390758e

verified ·

1 Parent(s): b02ae77

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -65

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import gradio as gr
 import os
 import nltk
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from fpdf import FPDF
 from gtts import gTTS
 from pdfminer.high_level import extract_text
@@ -10,97 +11,93 @@ from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 nltk.download('punkt')
-# Load both models and tokenizers
-# Default BART model
-tokenizer_default = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
-model_default = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
-# Legal-specific Pegasus model
-tokenizer_legal = AutoTokenizer.from_pretrained("nlpaueb/legal-pegasus-base")
-model_legal = AutoModelForSeq2SeqLM.from_pretrained("nlpaueb/legal-pegasus-base")
 # Convert DOCX to PDF using ReportLab
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     doc = Document(docx_file)
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
     pdf = canvas.Canvas(output_pdf, pagesize=letter)
     pdf.setFont("Helvetica", 12)
-    text = pdf.beginText(40, 750)
     for line in full_text:
-        text.textLine(line)
-    pdf.drawText(text)
     pdf.save()
     return output_pdf
 # Process input file (PDF or DOCX)
-def pdf_to_text(text, PDF, min_length=20, model_selected="Default BART"):
     try:
-        # Select the appropriate model and tokenizer based on user choice
-        if model_selected == "Default BART":
-            tokenizer = tokenizer_default
-            model = model_default
-        elif model_selected == "Legal Pegasus":
-            tokenizer = tokenizer_legal
-            model = model_legal
         else:
-            tokenizer = tokenizer_default
-            model = model_default
-        file_extension = os.path.splitext(PDF.name)[1].lower()
-        if file_extension == '.docx':
-            pdf_file_path = docx_to_pdf(PDF.name)
-            text = extract_text(pdf_file_path)
-        elif file_extension == '.pdf' and text == "":
-            text = extract_text(PDF.name)
-        # Tokenize and summarize the text using the selected model
-        inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
-        min_length = int(min_length)
-        summary_ids = model.generate(
-            inputs["input_ids"],
-            num_beams=4,
-            min_length=min_length,
-            max_length=min_length+500,
-            early_stopping=True
-        )
-        output_text = tokenizer.batch_decode(
-            summary_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )[0]
         # Generate a PDF of the summary
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Times", size=12)
-        pdf.multi_cell(190, 10, txt=output_text, align='C')
         pdf_output_path = "legal_summary.pdf"
         pdf.output(pdf_output_path)
         # Generate an audio file of the summary
         audio_output_path = "legal_summary.wav"
-        tts = gTTS(text=output_text, lang='en', slow=False)
         tts.save(audio_output_path)
-        return audio_output_path, output_text, pdf_output_path
     except Exception as e:
         return None, f"An error occurred: {str(e)}", None
 # Preloaded document handler
-def process_sample_document(min_length=20, model_selected="Default BART"):
     sample_document_path = "Marbury v. Madison.pdf"
     with open(sample_document_path, "rb") as f:
-        return pdf_to_text("", f, min_length, model_selected)
 # Gradio interface
 with gr.Blocks() as iface:
@@ -109,27 +106,32 @@ with gr.Blocks() as iface:
     text_input = gr.Textbox(label="Input Text")
     file_input = gr.File(label="Upload PDF or DOCX")
-    slider = gr.Slider(minimum=10, maximum=500, step=10, value=100, label="Summary Minimum Length")
-    model_choice = gr.Dropdown(
-        choices=["Default BART", "Legal Pegasus"],
-        value="Default BART",
-        label="Choose Summarization Model"
-    )
     audio_output = gr.Audio(label="Generated Audio")
     summary_output = gr.Textbox(label="Generated Summary")
     pdf_output = gr.File(label="Summary PDF")
     process_sample_button.click(
         fn=process_sample_document,
-        inputs=[slider, model_choice],
         outputs=[audio_output, summary_output, pdf_output]
     )
     file_input.change(
-        fn=pdf_to_text,
-        inputs=[text_input, file_input, slider, model_choice],
         outputs=[audio_output, summary_output, pdf_output]
     )
 if __name__ == "__main__":
     iface.launch()

 import gradio as gr
 import os
 import nltk
+import torch
+from transformers import AutoTokenizer, AutoModel
 from fpdf import FPDF
 from gtts import gTTS
 from pdfminer.high_level import extract_text
 from reportlab.pdfgen import canvas
 nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
+# Load the LegalBERT model and tokenizer
+tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
+model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
 # Convert DOCX to PDF using ReportLab
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     """Render the paragraphs of a DOCX file into a plain-text PDF.

     Each paragraph is word-wrapped to the printable width and written with
     Helvetica 12pt. A new page is started whenever the text cursor drops
     below the bottom margin, so long documents are not drawn off the page.

     Relies on ``Document``, ``canvas`` and ``letter`` being available at
     module level (their imports are not visible in this excerpt).

     Args:
         docx_file: Path (or file-like object) handed to ``Document``.
         output_pdf: Path of the PDF file to write.

     Returns:
         The ``output_pdf`` path that was written.
     """
     from reportlab.lib.utils import simpleSplit

     font_name, font_size = "Helvetica", 12
     margin = 40
     top_y = 750
     max_width = letter[0] - 2 * margin

     doc = Document(docx_file)
     pdf = canvas.Canvas(output_pdf, pagesize=letter)

     def start_page():
         # showPage() resets the canvas font, so set it again for every page.
         pdf.setFont(font_name, font_size)
         return pdf.beginText(margin, top_y)

     text_object = start_page()
     for para in doc.paragraphs:
         # An empty paragraph still produces one blank line, as before.
         wrapped = simpleSplit(para.text, font_name, font_size, max_width) or [""]
         for line in wrapped:
             if text_object.getY() < margin:
                 # Current page is full: flush it and continue on a new one.
                 pdf.drawText(text_object)
                 pdf.showPage()
                 text_object = start_page()
             text_object.textLine(line)
     pdf.drawText(text_object)
     pdf.save()
     return output_pdf
+# Extractive summarization using LegalBERT
def extractive_summarization(text, num_sentences=5):
    """Build an extractive summary from the most central sentences of *text*.

    Sentences are embedded with the module-level ``tokenizer`` / ``model``
    pair, using a mean over the real (non-padding) tokens of each sentence.
    Each sentence is scored by cosine similarity to the mean of all sentence
    embeddings, and the top-scoring sentences are returned in their original
    document order.

    Args:
        text: Document text to summarise.
        num_sentences: Maximum number of sentences to keep. Non-integer
            values (e.g. a float coming from a UI slider) are truncated.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when there
        is nothing to summarise (empty text or ``num_sentences <= 0``).
    """
    if not text or not text.strip():
        return ""
    # torch.topk needs an integer k; UI sliders may deliver floats.
    num_sentences = int(num_sentences)
    if num_sentences <= 0:
        return ""

    sentences = [s for s in sent_tokenize(text) if s.strip()]
    if not sentences:
        return ""
    # Handle case where document has fewer sentences than requested
    num_sentences = min(num_sentences, len(sentences))
    if num_sentences == len(sentences):
        # Every sentence would be selected anyway; skip the model entirely.
        return ' '.join(sentences)

    # Encode in small batches so long documents do not need one huge tensor.
    batch_size = 32
    pooled = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
            hidden = model(**inputs).last_hidden_state
            # Average only over real tokens: padding positions must not
            # dilute the embedding of short sentences.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled.append((hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1))
    embeddings = torch.cat(pooled, dim=0)

    # Compute similarity of each sentence to the document embedding
    document_embedding = embeddings.mean(dim=0, keepdim=True)
    similarities = torch.nn.functional.cosine_similarity(embeddings, document_embedding)

    # Select top sentences, then restore original document order
    top_indices = torch.topk(similarities, k=num_sentences).indices
    selected_indices = sorted(top_indices.tolist())
    return ' '.join(sentences[idx] for idx in selected_indices)
 # Process input file (PDF or DOCX)
def pdf_to_text(text, PDF, num_sentences=5):
    """Summarise an uploaded document or raw text; emit PDF and audio files.

    When *PDF* is given it takes precedence over *text*: ``.docx`` files are
    converted with ``docx_to_pdf`` and then read with ``extract_text``;
    ``.pdf`` files are read directly. The text is summarised with
    ``extractive_summarization``. The summary is then written to
    ``legal_summary.pdf`` and spoken into ``legal_summary.mp3``.

    Args:
        text: Raw input text, used only when no file is supplied.
        PDF: Uploaded file: either an object exposing ``.name`` (a path) or
            a plain path string; ``None`` when nothing was uploaded.
        num_sentences: Number of sentences requested for the summary.

    Returns:
        ``(audio_path, summary, pdf_path)`` on success, or
        ``(None, message, None)`` when input is missing or unsupported, or
        when any step fails.
    """
    try:
        if PDF is not None:
            # Uploads may arrive as a tempfile-like object or as a bare path.
            file_path = getattr(PDF, "name", PDF)
            file_extension = os.path.splitext(str(file_path))[1].lower()
            if file_extension == '.docx':
                pdf_file_path = docx_to_pdf(file_path)
                text = extract_text(pdf_file_path)
            elif file_extension == '.pdf':
                text = extract_text(file_path)
            else:
                return None, "Unsupported file type", None
            if not text or not text.strip():
                # e.g. a scanned PDF with no text layer
                return None, "No extractable text was found in the uploaded file.", None
        elif not text or not text.strip():
            return None, "Please provide input text or upload a file.", None

        summary = extractive_summarization(text, num_sentences)

        # Generate a PDF of the summary
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        # The built-in Times font is Latin-1 only; replace anything else
        # (curly quotes, dashes, ...) instead of failing the whole request.
        printable_summary = summary.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(190, 10, txt=printable_summary, align='L')
        pdf_output_path = "legal_summary.pdf"
        pdf.output(pdf_output_path)

        # Generate an audio file of the summary (gTTS writes MP3 data)
        audio_output_path = "legal_summary.mp3"
        tts = gTTS(text=summary, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, summary, pdf_output_path
    except Exception as e:
        # UI boundary: report the failure in the summary box instead of raising.
        return None, f"An error occurred: {str(e)}", None
 # Preloaded document handler
def process_sample_document(num_sentences=5):
    """Summarise the bundled sample document ("Marbury v. Madison.pdf").

    Args:
        num_sentences: Number of sentences requested for the summary.

    Returns:
        Whatever ``pdf_to_text`` returns for the sample file, or
        ``(None, message, None)`` if the sample file cannot be opened.
    """
    sample_document_path = "Marbury v. Madison.pdf"
    try:
        sample_file = open(sample_document_path, "rb")
    except OSError as e:
        # Keep the three-value shape the UI outputs expect instead of raising.
        return None, f"An error occurred: {str(e)}", None
    with sample_file:
        # The open handle is passed so pdf_to_text can read its .name path.
        return pdf_to_text("", sample_file, num_sentences)
 # Gradio interface
 with gr.Blocks() as iface:
     text_input = gr.Textbox(label="Input Text")
     file_input = gr.File(label="Upload PDF or DOCX")
+    slider = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of Summary Sentences")
     audio_output = gr.Audio(label="Generated Audio")
     summary_output = gr.Textbox(label="Generated Summary")
     pdf_output = gr.File(label="Summary PDF")
+    # Update the function calls to match new parameters
     process_sample_button.click(
         fn=process_sample_document,
+        inputs=slider,
+        outputs=[audio_output, summary_output, pdf_output]
+    )
+    # Use submit event for the text input and file input
+    def on_submit(text, file, num_sentences):
+        return pdf_to_text(text, file, num_sentences)
+    text_input.submit(
+        fn=on_submit,
+        inputs=[text_input, file_input, slider],
         outputs=[audio_output, summary_output, pdf_output]
     )
     file_input.change(
+        fn=on_submit,
+        inputs=[text_input, file_input, slider],
         outputs=[audio_output, summary_output, pdf_output]
     )
 if __name__ == "__main__":
     iface.launch()