File size: 5,279 Bytes
17e34a5
94bf427
390758e
 
94bf427
3813c2d
94bf427
ac28e59
82e6a9a
 
c3c2470
 
b1f3a7a
cffceba
c3c2470
 
 
 
 
 
 
11ef927
c3c2470
 
390758e
c0d316e
414ba52
c0d316e
 
390758e
c3c2470
c0d316e
 
c3c2470
390758e
c0d316e
390758e
c3c2470
390758e
c0d316e
 
9d0e6a8
b1f3a7a
390758e
b1f3a7a
 
 
390758e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ccf10b
390758e
7f2b3e5
390758e
 
 
 
 
 
 
 
 
 
 
42d4ded
390758e
c3c2470
390758e
c3c2470
414ba52
94bf427
 
 
390758e
414ba52
94bf427
c3c2470
414ba52
 
390758e
7f2b3e5
c3c2470
390758e
c3c2470
7f2b3e5
 
 
8ccf10b
390758e
f6ccaae
 
390758e
6de4d60
204d8e4
f6ccaae
 
414ba52
c3c2470
f6ccaae
 
390758e
c3c2470
f6ccaae
 
 
c3c2470
390758e
414ba52
c3c2470
 
390758e
 
 
 
 
c3c2470
390758e
c3c2470
 
414ba52
 
 
c3c2470
 
414ba52
 
c3c2470
342a4a2
ac28e59
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import gradio as gr
import os
import torch
from transformers import AutoTokenizer, AutoModel
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# spaCy provides sentence segmentation for the summarizer.
import spacy

SPACY_MODEL_NAME = "en_core_web_sm"
LEGAL_BERT_NAME = "nlpaueb/legal-bert-base-uncased"

try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    # The pipeline package is not installed yet: fetch it once, then retry the load.
    from spacy.cli import download
    download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

# LegalBERT encoder and its tokenizer (slow tokenizer requested explicitly via use_fast=False).
tokenizer = AutoTokenizer.from_pretrained(LEGAL_BERT_NAME, use_fast=False)
model = AutoModel.from_pretrained(LEGAL_BERT_NAME)

# Convert DOCX to PDF using ReportLab
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    """Render the paragraphs of a DOCX file into a plain-text PDF.

    Each paragraph is written as one text line in 12pt Helvetica on
    letter-sized pages. When the cursor drops below the bottom margin a new
    page is started, so long documents are not drawn past the end of the
    first page.

    Args:
        docx_file: Path (or file-like object) accepted by ``docx.Document``.
        output_pdf: Path of the PDF file to create.

    Returns:
        The value of ``output_pdf``.
    """
    left_margin = 40
    top_y = 750
    bottom_y = 40
    font_name, font_size = "Helvetica", 12

    doc = Document(docx_file)
    pdf = canvas.Canvas(output_pdf, pagesize=letter)

    def start_text_block():
        # The font is set again for every page because showPage() starts a
        # fresh page state on the canvas.
        pdf.setFont(font_name, font_size)
        return pdf.beginText(left_margin, top_y)

    text_object = start_text_block()
    for para in doc.paragraphs:
        if text_object.getY() < bottom_y:
            # Current page is full: flush it and continue on a new one.
            pdf.drawText(text_object)
            pdf.showPage()
            text_object = start_text_block()
        text_object.textLine(para.text)

    pdf.drawText(text_object)
    pdf.save()
    return output_pdf

# Extractive summarization using LegalBERT and spaCy
def extractive_summarization(text, num_sentences=5):
    """Build an extractive summary by ranking sentences with LegalBERT.

    The text is split into sentences with spaCy, each sentence is embedded
    with the module-level LegalBERT model, and the sentences whose embedding
    is most similar (cosine) to the mean embedding of all sentences are
    returned in their original order.

    Args:
        text: The document text to summarize.
        num_sentences: Maximum number of sentences to keep. Values coming
            from UI sliders may be floats, so the value is coerced to int.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when there is nothing to summarize (blank text or fewer than one
        sentence requested).
    """
    num_sentences = int(num_sentences)
    if not text or not text.strip() or num_sentences < 1:
        return ""

    # Tokenize text into sentences using spaCy
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if not sentences:
        return ""
    # Handle case where document has fewer sentences than requested
    num_sentences = min(num_sentences, len(sentences))

    # Encode in fixed-size batches so memory use does not grow with the
    # length of the document.
    batch_size = 32
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
            hidden_states = model(**inputs).last_hidden_state
            # Average only over real tokens: padding positions are zeroed out
            # via the attention mask so short sentences are not skewed by the
            # padding added to match the longest sentence in the batch.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            summed = (hidden_states * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(summed / counts)
    embeddings = torch.cat(pooled_batches, dim=0)

    # Compute similarity of each sentence to the document embedding
    document_embedding = embeddings.mean(dim=0, keepdim=True)
    similarities = torch.nn.functional.cosine_similarity(embeddings, document_embedding)

    # Select top sentences, then restore document order for readability
    top_indices = torch.topk(similarities, k=num_sentences).indices.tolist()
    return ' '.join(sentences[idx] for idx in sorted(top_indices))

# Process input file (PDF or DOCX)
def pdf_to_text(text, PDF, num_sentences=5):
    """Summarize typed text or an uploaded PDF/DOCX and export the summary.

    Args:
        text: Text typed by the user; used only when no file is supplied.
        PDF: Uploaded file, given either as a path string or as an object
            exposing the path through ``.name``; ``None`` when no file was
            uploaded.
        num_sentences: Number of sentences to keep in the summary.

    Returns:
        A tuple ``(audio_path, summary_or_message, pdf_path)``. On any
        failure the two paths are ``None`` and the middle element is a
        human-readable message.
    """
    import tempfile

    try:
        if PDF is not None:
            # Accept both a plain path and a file-like object with ``.name``.
            if isinstance(PDF, (str, os.PathLike)):
                file_path = os.fspath(PDF)
            else:
                file_path = PDF.name
            file_extension = os.path.splitext(file_path)[1].lower()
            if file_extension == '.docx':
                # Read paragraphs directly instead of rendering to an
                # intermediate PDF: no shared scratch file, no layout loss.
                text = "\n".join(para.text for para in Document(file_path).paragraphs)
            elif file_extension == '.pdf':
                text = extract_text(file_path)
            else:
                return None, "Unsupported file type", None
            if not text or not text.strip():
                return None, "No text could be extracted from the uploaded file.", None
        elif not text or not text.strip():
            return None, "Please provide input text or upload a file.", None

        summary = extractive_summarization(text, int(num_sentences))
        if not summary or not summary.strip():
            return None, "No summarizable text was found in the input.", None

        # Write outputs into a per-request directory so concurrent requests
        # cannot overwrite each other's files.
        output_dir = tempfile.mkdtemp(prefix="legal_summary_")

        # Generate a PDF of the summary
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        # The built-in "Times" font only covers Latin-1; replace anything
        # else (curly quotes, dashes, ...) rather than failing the request.
        printable_summary = summary.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(190, 10, txt=printable_summary, align='L')
        pdf_output_path = os.path.join(output_dir, "legal_summary.pdf")
        pdf.output(pdf_output_path)

        # Generate an audio file of the summary (gTTS produces MP3 data,
        # so the file is named accordingly).
        audio_output_path = os.path.join(output_dir, "legal_summary.mp3")
        tts = gTTS(text=summary, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, summary, pdf_output_path

    except Exception as e:
        # UI boundary: report the failure in the summary box instead of
        # crashing the request.
        return None, f"An error occurred: {str(e)}", None

# Preloaded document handler
def process_sample_document(num_sentences=5):
    """Summarize the bundled "Marbury v. Madison.pdf" sample document.

    Args:
        num_sentences: Number of sentences to keep in the summary.

    Returns:
        The ``(audio_path, summary_or_message, pdf_path)`` tuple produced by
        ``pdf_to_text``. If the sample file cannot be opened, the paths are
        ``None`` and the message describes the error, matching the error
        shape used by ``pdf_to_text``.
    """
    sample_document_path = "Marbury v. Madison.pdf"
    try:
        sample_file = open(sample_document_path, "rb")
    except OSError as e:
        # Keep the three-output contract so the UI shows a message instead
        # of an unhandled exception when the sample is missing.
        return None, f"An error occurred: {str(e)}", None
    with sample_file:
        return pdf_to_text("", sample_file, num_sentences)

# Gradio interface
with gr.Blocks() as iface:
    # One-click demo that runs the summarizer on the bundled sample case.
    with gr.Row():
        process_sample_button = gr.Button("Summarize Marbury v. Madison Case (Pre-Uploaded)")

    # User-supplied inputs.
    text_input = gr.Textbox(label="Input Text")
    file_input = gr.File(label="Upload PDF or DOCX")
    slider = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of Summary Sentences")

    # Result widgets, in the order returned by the handlers.
    audio_output = gr.Audio(label="Generated Audio")
    summary_output = gr.Textbox(label="Generated Summary")
    pdf_output = gr.File(label="Summary PDF")

    def on_submit(text, file, num_sentences):
        """Pass the current text, file and sentence count on to pdf_to_text."""
        return pdf_to_text(text, file, num_sentences)

    request_components = [text_input, file_input, slider]
    result_components = [audio_output, summary_output, pdf_output]

    # The sample button only needs the sentence count.
    process_sample_button.click(
        fn=process_sample_document,
        inputs=slider,
        outputs=result_components,
    )
    # Summaries are produced when the text box is submitted or the uploaded file changes.
    text_input.submit(
        fn=on_submit,
        inputs=request_components,
        outputs=result_components,
    )
    file_input.change(
        fn=on_submit,
        inputs=request_components,
        outputs=result_components,
    )

if __name__ == "__main__":
    iface.launch()