#**************** IMPORT PACKAGES ********************
import gradio as gr
import numpy as np
import pytesseract as pt
import pdf2image
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import os
import pdfkit
import yake
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from summarizer import Summarizer, TransformerSummarizer
from transformers import pipelines
nltk.download('punkt')
print("let's go")
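
# pdf() OCRs an uploaded PDF with Tesseract and returns two extractive summaries
# produced with legal-bert: a short whole-document summary and a longer
# paragraph-by-paragraph one.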
def pdf(file):
    # model_name = 'laxya007/gpt2_legal'
    # model_name = 'facebook/bart-large-cnn'
    model_name = 'nlpaueb/legal-bert-base-uncased'
    # Hugging Face setup: config, tokenizer and model for the extractive summarizer
    custom_config = AutoConfig.from_pretrained(model_name)
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
    custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
    bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

    # Render each PDF page to an image
    pages = pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654, 2340))
    content = ""
    # Create the output folder if it doesn't exist yet
    dir_name = 'images/' + file + '/'
    os.makedirs(dir_name, exist_ok=True)
    for i in range(len(pages)):
        pages[i].save(dir_name + str(i) + '.jpg')
        # OCR the page image using Google's Tesseract
        content += pt.image_to_string(pages[i])
    # Summarize the OCR'd text paragraph by paragraph
    summary_text = ""
    for i, paragraph in enumerate(content.split("\n\n")):
        paragraph = paragraph.replace('\n', ' ')
        paragraph = paragraph.replace('\t', '')
        paragraph = ' '.join(paragraph.split())
        # Tokenize and keep only alphabetic words
        tokens = word_tokenize(paragraph)
        tokens = [word for word in tokens if word.isalpha()]
        # Skip paragraphs with one real word or fewer
        if len(tokens) <= 1:
            continue
        # Perhaps also ignore paragraphs with no sentence?
        sentences = sent_tokenize(paragraph)
        paragraph = ' '.join(tokens)
        print("\nParagraph:")
        print(paragraph + "\n")
        # T5 would need a 'summarize:' prefix; legal-bert does not:
        # text = "summarize: " + paragraph
        text = paragraph
        summary = bert_legal_model(text, min_length=8, ratio=0.05)
        summary_text += str(summary) + "\n\n"
        print("Summary:")
        print(summary)
    # Summarize the whole document in at most 25 sentences
    content2 = content.replace('\n', ' ')
    content2 = content2.replace('\t', '')
    summary = bert_legal_model(content2, min_length=8, num_sentences=25)

    # Combine the short summary and the paragraph-level summaries
    all_text = "The Summary-- " + str(summary) + "\n\n\n" \
        + "The Larger Summary-- " + str(summary_text)

    # Clean up OCR artefacts and characters outside latin-1
    all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
    all_text2 = all_text2.replace('?', '.')
    all_text2 = all_text2.replace('\n', ' ')
    all_text2 = all_text2.replace('..', '.')
    all_text2 = all_text2.replace(',.', ',')
    all_text2 = all_text2.replace('-- ', '\n\n\n')

    # FPDF object is created but nothing is written to it; the cleaned text is returned directly
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    f = all_text2
    return f
iface = gr.Interface(
    pdf,
    "file",
    "text"
)
if __name__ == "__main__":
    iface.launch(share=True)