#**************** IMPORT PACKAGES ********************
import os

import gradio as gr
import nltk
import pdf2image
import pytesseract as pt
from fpdf import FPDF
from nltk.tokenize import word_tokenize
from summarizer import Summarizer
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Download the Punkt models that NLTK's tokenizers need.
nltk.download('punkt')

print("lets go")

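# Pipeline: render each PDF page to an image with pdf2image, OCR the pages
# with pytesseract, then run an extractive Legal-BERT summarizer over each
# paragraph and over the full text, and return both summaries.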
def summarize_pdf(file):
  # model_name = 'laxya007/gpt2_legal'
  # model_name = 'facebook/bart-large-cnn'
  model_name = 'nlpaueb/legal-bert-base-uncased'

  # Set up the Hugging Face model: keep the hidden states so the extractive
  # summarizer can use them as sentence embeddings.
  custom_config = AutoConfig.from_pretrained(model_name)
  custom_config.output_hidden_states = True
  custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
  custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
  bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

  # Gradio's "file" input may pass a tempfile-like object or a plain path;
  # resolve it to a path string either way.
  file_path = file.name if hasattr(file, 'name') else file
  pages = pdf2image.convert_from_path(pdf_path=file_path, dpi=400, size=(1654, 2340))

  content = ""
  # If the image folder doesn't exist, create it.
  dir_name = 'images/' + os.path.basename(file_path) + '/'
  os.makedirs(dir_name, exist_ok=True)
  for i in range(len(pages)):
      pages[i].save(dir_name + str(i) + '.jpg')
      # OCR the page image with Tesseract.
      content += pt.image_to_string(pages[i])

  summary_text = ""
  for paragraph in content.split("\n\n"):
      paragraph = paragraph.replace('\n', ' ')
      paragraph = paragraph.replace('\t', '')
      paragraph = ' '.join(paragraph.split())
      # Tokenize and keep only real words, dropping numbers and OCR noise.
      tokens = word_tokenize(paragraph)
      tokens = [word for word in tokens if word.isalpha()]
      # Skip fragments with at most one real word.
      if len(tokens) <= 1:
          continue
      paragraph = ' '.join(tokens)

      print("\nParagraph:")
      print(paragraph + "\n")
      # Note: a T5-style model would need a 'summarize: ' prefix here; the
      # extractive BERT summarizer takes the raw paragraph text.
      summary = bert_legal_model(paragraph, min_length=8, ratio=0.05)
      summary_text += str(summary) + "\n\n"
      print("Summary:")
      print(summary)

  # Summarize the whole document in one pass as well.
  content2 = content.replace('\n', ' ')
  content2 = content2.replace('\t', '')
  summary = bert_legal_model(content2, min_length=8, num_sentences=25)

  # Collect both summaries for inspection and storage.
  all_text = "The Summary-- " + str(summary) + "\n\n\n" \
      + "The Larger Summary-- " + str(summary_text)

  # FPDF's core fonts only cover Latin-1, so replace anything outside it,
  # then tidy the '?' replacement characters and OCR artifacts.
  all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
  all_text2 = all_text2.replace('?', '.')
  all_text2 = all_text2.replace('\n', ' ')
  all_text2 = all_text2.replace('..', '.')
  all_text2 = all_text2.replace(',.', ',')
  all_text2 = all_text2.replace('-- ', '\n\n\n')

  # Also save the cleaned text as a PDF for storage (filename chosen here);
  # the Gradio interface itself returns the plain text.
  pdf_doc = FPDF()
  pdf_doc.add_page()
  pdf_doc.set_font("Times", size=12)
  pdf_doc.multi_cell(0, 10, all_text2)
  pdf_doc.output("summary.pdf")

  return all_text2


iface = gr.Interface(
    fn=summarize_pdf,
    inputs="file",
    outputs="text",
)

if __name__ == "__main__":
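    # share=True additionally creates a temporary public Gradio link.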
    iface.launch(share=True)