arithescientist committed on
Commit
390758e
·
verified ·
1 Parent(s): b02ae77

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -65
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import gradio as gr
2
  import os
3
  import nltk
4
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
5
  from fpdf import FPDF
6
  from gtts import gTTS
7
  from pdfminer.high_level import extract_text
@@ -10,97 +11,93 @@ from reportlab.lib.pagesizes import letter
10
  from reportlab.pdfgen import canvas
11
 
12
  nltk.download('punkt')
 
13
 
14
- # Load both models and tokenizers
15
- # Default BART model
16
- tokenizer_default = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
17
- model_default = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
18
-
19
- # Legal-specific Pegasus model
20
- tokenizer_legal = AutoTokenizer.from_pretrained("nlpaueb/legal-pegasus-base")
21
- model_legal = AutoModelForSeq2SeqLM.from_pretrained("nlpaueb/legal-pegasus-base")
22
 
23
  # Convert DOCX to PDF using ReportLab
24
  def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
25
  doc = Document(docx_file)
26
- full_text = []
27
- for para in doc.paragraphs:
28
- full_text.append(para.text)
29
 
30
  pdf = canvas.Canvas(output_pdf, pagesize=letter)
31
  pdf.setFont("Helvetica", 12)
32
 
33
- text = pdf.beginText(40, 750)
34
  for line in full_text:
35
- text.textLine(line)
36
 
37
- pdf.drawText(text)
38
  pdf.save()
39
  return output_pdf
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  # Process input file (PDF or DOCX)
42
- def pdf_to_text(text, PDF, min_length=20, model_selected="Default BART"):
43
  try:
44
- # Select the appropriate model and tokenizer based on user choice
45
- if model_selected == "Default BART":
46
- tokenizer = tokenizer_default
47
- model = model_default
48
- elif model_selected == "Legal Pegasus":
49
- tokenizer = tokenizer_legal
50
- model = model_legal
 
 
 
 
51
  else:
52
- tokenizer = tokenizer_default
53
- model = model_default
54
-
55
- file_extension = os.path.splitext(PDF.name)[1].lower()
56
 
57
- if file_extension == '.docx':
58
- pdf_file_path = docx_to_pdf(PDF.name)
59
- text = extract_text(pdf_file_path)
60
- elif file_extension == '.pdf' and text == "":
61
- text = extract_text(PDF.name)
62
-
63
- # Tokenize and summarize the text using the selected model
64
- inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
65
- min_length = int(min_length)
66
-
67
- summary_ids = model.generate(
68
- inputs["input_ids"],
69
- num_beams=4,
70
- min_length=min_length,
71
- max_length=min_length+500,
72
- early_stopping=True
73
- )
74
- output_text = tokenizer.batch_decode(
75
- summary_ids,
76
- skip_special_tokens=True,
77
- clean_up_tokenization_spaces=False
78
- )[0]
79
 
80
  # Generate a PDF of the summary
81
  pdf = FPDF()
82
  pdf.add_page()
83
  pdf.set_font("Times", size=12)
84
- pdf.multi_cell(190, 10, txt=output_text, align='C')
85
  pdf_output_path = "legal_summary.pdf"
86
  pdf.output(pdf_output_path)
87
 
88
  # Generate an audio file of the summary
89
  audio_output_path = "legal_summary.wav"
90
- tts = gTTS(text=output_text, lang='en', slow=False)
91
  tts.save(audio_output_path)
92
 
93
- return audio_output_path, output_text, pdf_output_path
94
 
95
  except Exception as e:
96
  return None, f"An error occurred: {str(e)}", None
97
 
98
  # Preloaded document handler
99
- def process_sample_document(min_length=20, model_selected="Default BART"):
100
  sample_document_path = "Marbury v. Madison.pdf"
101
-
102
  with open(sample_document_path, "rb") as f:
103
- return pdf_to_text("", f, min_length, model_selected)
104
 
105
  # Gradio interface
106
  with gr.Blocks() as iface:
@@ -109,27 +106,32 @@ with gr.Blocks() as iface:
109
 
110
  text_input = gr.Textbox(label="Input Text")
111
  file_input = gr.File(label="Upload PDF or DOCX")
112
- slider = gr.Slider(minimum=10, maximum=500, step=10, value=100, label="Summary Minimum Length")
113
- model_choice = gr.Dropdown(
114
- choices=["Default BART", "Legal Pegasus"],
115
- value="Default BART",
116
- label="Choose Summarization Model"
117
- )
118
 
119
  audio_output = gr.Audio(label="Generated Audio")
120
  summary_output = gr.Textbox(label="Generated Summary")
121
  pdf_output = gr.File(label="Summary PDF")
122
 
 
123
  process_sample_button.click(
124
  fn=process_sample_document,
125
- inputs=[slider, model_choice],
 
 
 
 
 
 
 
 
 
126
  outputs=[audio_output, summary_output, pdf_output]
127
  )
128
  file_input.change(
129
- fn=pdf_to_text,
130
- inputs=[text_input, file_input, slider, model_choice],
131
  outputs=[audio_output, summary_output, pdf_output]
132
  )
133
-
134
  if __name__ == "__main__":
135
  iface.launch()
 
1
  import gradio as gr
2
  import os
3
  import nltk
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModel
6
  from fpdf import FPDF
7
  from gtts import gTTS
8
  from pdfminer.high_level import extract_text
 
11
  from reportlab.pdfgen import canvas
12
 
13
  nltk.download('punkt')
14
+ from nltk.tokenize import sent_tokenize
15
 
16
+ # Load the LegalBERT model and tokenizer
17
+ tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
18
+ model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
 
 
 
 
 
19
 
20
  # Convert DOCX to PDF using ReportLab
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    """Render the paragraphs of a DOCX file into a plain-text PDF.

    Args:
        docx_file: Path (or file-like object) handed to ``Document``.
        output_pdf: Destination path of the generated PDF.

    Returns:
        The ``output_pdf`` path that was written.
    """
    import textwrap  # local import: only this function needs it

    doc = Document(docx_file)
    full_text = [para.text for para in doc.paragraphs]

    pdf = canvas.Canvas(output_pdf, pagesize=letter)
    page_height = letter[1]
    left_margin = 40
    top_offset = 42      # keeps the first baseline at y=750 on a letter page
    bottom_margin = 40
    # Approximate character budget per line for Helvetica 12 between the
    # side margins; long paragraphs would otherwise run off the right edge.
    max_chars = 85

    def start_text_object():
        # The font is set before every text object so each new page starts
        # with the same face and size as the first one.
        pdf.setFont("Helvetica", 12)
        return pdf.beginText(left_margin, page_height - top_offset)

    text_object = start_text_object()
    for paragraph in full_text:
        # An empty paragraph still produces one blank line, as before.
        for line in textwrap.wrap(paragraph, width=max_chars) or [""]:
            if text_object.getY() < bottom_margin:
                # Page is full: flush what we have and continue on a new page
                # instead of drawing below the bottom edge.
                pdf.drawText(text_object)
                pdf.showPage()
                text_object = start_text_object()
            text_object.textLine(line)

    pdf.drawText(text_object)
    pdf.save()
    return output_pdf
35
 
36
+ # Extractive summarization using LegalBERT
def extractive_summarization(text, num_sentences=5, batch_size=16):
    """Build an extractive summary by picking the most central sentences.

    Each sentence is embedded with the module-level ``tokenizer``/``model``,
    scored by cosine similarity to the mean of all sentence embeddings, and
    the top-scoring sentences are returned in their original order.

    Args:
        text: Document text to summarize.
        num_sentences: Maximum number of sentences to keep.
        batch_size: Number of sentences encoded per forward pass, so long
            documents are not pushed through the model in one huge batch.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when there
        is nothing to summarize.
    """
    # UI sliders may deliver floats; torch.topk needs an int.
    num_sentences = int(num_sentences)
    if not text or not text.strip() or num_sentences <= 0:
        return ""

    sentences = sent_tokenize(text)
    if not sentences:
        return ""
    # Handle documents with fewer sentences than requested.
    num_sentences = min(num_sentences, len(sentences))

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
            hidden = model(**inputs).last_hidden_state
            # Average only over real tokens: padding positions are masked out
            # so short sentences are not skewed by their padding.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(token_sum / token_count)
    embeddings = torch.cat(pooled_batches, dim=0)

    # Score every sentence against the document-level (mean) embedding.
    document_embedding = embeddings.mean(dim=0, keepdim=True)
    similarities = torch.nn.functional.cosine_similarity(embeddings, document_embedding)

    top_k = torch.topk(similarities, k=num_sentences)
    # Sort the winning indices so the summary follows the document order.
    selected_indices = sorted(top_k.indices.tolist())
    return ' '.join(sentences[idx] for idx in selected_indices)
58
+
59
  # Process input file (PDF or DOCX)
def pdf_to_text(text, PDF, num_sentences=5):
    """Summarize an uploaded document or raw text and export the summary.

    When ``PDF`` is given it takes precedence over ``text``: ``.docx`` files
    are first converted with ``docx_to_pdf`` and ``.pdf`` files are read
    directly via ``extract_text``. Otherwise ``text`` itself is summarized.

    Args:
        text: Raw input text; used only when no file is supplied.
        PDF: Uploaded file, given either as a path or as an object exposing
            the path through ``.name``; ``None`` when absent.
        num_sentences: Number of sentences requested for the summary.

    Returns:
        A tuple ``(audio_path, summary_or_message, pdf_path)``. On invalid
        input or any failure the paths are ``None`` and the middle element
        carries a human-readable message.
    """
    try:
        if PDF is not None:
            # Accept plain paths as well as file-like objects with ``.name``.
            if isinstance(PDF, (str, os.PathLike)):
                file_path = os.fspath(PDF)
            else:
                file_path = PDF.name
            file_extension = os.path.splitext(file_path)[1].lower()
            if file_extension == '.docx':
                pdf_file_path = docx_to_pdf(file_path)
                text = extract_text(pdf_file_path)
            elif file_extension == '.pdf':
                text = extract_text(file_path)
            else:
                return None, "Unsupported file type", None
        elif not text or not text.strip():
            # Covers None, "" and whitespace-only input.
            return None, "Please provide input text or upload a file.", None

        summary = extractive_summarization(text, num_sentences)
        if not summary or not summary.strip():
            # Nothing to print or speak (e.g. no extractable text in the file).
            return None, "No text could be extracted to summarize.", None

        # Generate a PDF of the summary
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        # The built-in Times font only covers Latin-1; replace anything else
        # (curly quotes, dashes, ...) so PDF generation does not fail.
        printable_summary = summary.encode('latin-1', 'replace').decode('latin-1')
        pdf.multi_cell(190, 10, txt=printable_summary, align='L')
        pdf_output_path = "legal_summary.pdf"
        pdf.output(pdf_output_path)

        # Generate an audio file of the summary.
        # gTTS writes MP3 data, so the file is named accordingly.
        audio_output_path = "legal_summary.mp3"
        tts = gTTS(text=summary, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, summary, pdf_output_path

    except Exception as e:
        # UI boundary: report the failure in the summary slot instead of raising.
        return None, f"An error occurred: {str(e)}", None
95
 
96
  # Preloaded document handler
def process_sample_document(num_sentences=5):
    """Summarize the bundled sample document.

    Args:
        num_sentences: Number of sentences requested for the summary.

    Returns:
        The ``(audio_path, summary_or_message, pdf_path)`` tuple produced by
        ``pdf_to_text``; if the sample file cannot be opened, the paths are
        ``None`` and the middle element holds the error message.
    """
    sample_document_path = "Marbury v. Madison.pdf"
    try:
        with open(sample_document_path, "rb") as f:
            return pdf_to_text("", f, num_sentences)
    except OSError as e:
        # Keep the same error shape as pdf_to_text so the UI outputs still
        # receive three values when the sample file is missing or unreadable.
        return None, f"An error occurred: {str(e)}", None
101
 
102
  # Gradio interface
103
  with gr.Blocks() as iface:
 
106
 
107
  text_input = gr.Textbox(label="Input Text")
108
  file_input = gr.File(label="Upload PDF or DOCX")
109
+ slider = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of Summary Sentences")
 
 
 
 
 
110
 
111
  audio_output = gr.Audio(label="Generated Audio")
112
  summary_output = gr.Textbox(label="Generated Summary")
113
  pdf_output = gr.File(label="Summary PDF")
114
 
115
+ # Update the function calls to match new parameters
116
  process_sample_button.click(
117
  fn=process_sample_document,
118
+ inputs=slider,
119
+ outputs=[audio_output, summary_output, pdf_output]
120
+ )
121
+ # Use submit event for the text input and file input
122
+ def on_submit(text, file, num_sentences):
123
+ return pdf_to_text(text, file, num_sentences)
124
+
125
+ text_input.submit(
126
+ fn=on_submit,
127
+ inputs=[text_input, file_input, slider],
128
  outputs=[audio_output, summary_output, pdf_output]
129
  )
130
  file_input.change(
131
+ fn=on_submit,
132
+ inputs=[text_input, file_input, slider],
133
  outputs=[audio_output, summary_output, pdf_output]
134
  )
135
+
136
  if __name__ == "__main__":
137
  iface.launch()