#**************** IMPORT PACKAGES ********************
import gradio as gr
import numpy as np
import pytesseract as pt
import pdf2image
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import os
import pdfkit
import yake
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from summarizer import Summarizer, TransformerSummarizer
from transformers import pipelines
nltk.download('punkt')
print("let's go")
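
# pdf() OCRs an uploaded PDF with Tesseract and returns two extractive summaries
# produced with legal-bert: a short whole-document summary and a longer
# paragraph-by-paragraph one.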
def pdf(file):
    # model_name = 'laxya007/gpt2_legal'
    # model_name = 'facebook/bart-large-cnn'
    model_name = 'nlpaueb/legal-bert-base-uncased'
    # Hugging Face setup: config, tokenizer and model for the extractive summarizer
    custom_config = AutoConfig.from_pretrained(model_name)
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
    custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
    bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

    # Render each PDF page to an image
    pages = pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654, 2340))
    content = ""
    # Create the output folder if it doesn't exist yet
    dir_name = 'images/' + file + '/'
    os.makedirs(dir_name, exist_ok=True)
    for i in range(len(pages)):
        pages[i].save(dir_name + str(i) + '.jpg')
        # OCR the page image using Google's Tesseract
        content += pt.image_to_string(pages[i])
    # Summarize the OCR'd text paragraph by paragraph
    summary_text = ""
    for i, paragraph in enumerate(content.split("\n\n")):
        paragraph = paragraph.replace('\n', ' ')
        paragraph = paragraph.replace('\t', '')
        paragraph = ' '.join(paragraph.split())
        # Tokenize and keep only alphabetic words
        tokens = word_tokenize(paragraph)
        tokens = [word for word in tokens if word.isalpha()]
        # Skip paragraphs with one real word or fewer
        if len(tokens) <= 1:
            continue
        # Perhaps also ignore paragraphs with no sentence?
        sentences = sent_tokenize(paragraph)
        paragraph = ' '.join(tokens)
        print("\nParagraph:")
        print(paragraph + "\n")
        # T5 would need a 'summarize:' prefix; legal-bert does not:
        # text = "summarize: " + paragraph
        text = paragraph
        summary = bert_legal_model(text, min_length=8, ratio=0.05)
        summary_text += str(summary) + "\n\n"
        print("Summary:")
        print(summary)
    # Summarize the whole document in at most 25 sentences
    content2 = content.replace('\n', ' ')
    content2 = content2.replace('\t', '')
    summary = bert_legal_model(content2, min_length=8, num_sentences=25)

    # Combine the short summary and the paragraph-level summaries
    all_text = "The Summary-- " + str(summary) + "\n\n\n" \
        + "The Larger Summary-- " + str(summary_text)

    # Clean up OCR artefacts and characters outside latin-1
    all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
    all_text2 = all_text2.replace('?', '.')
    all_text2 = all_text2.replace('\n', ' ')
    all_text2 = all_text2.replace('..', '.')
    all_text2 = all_text2.replace(',.', ',')
    all_text2 = all_text2.replace('-- ', '\n\n\n')

    # FPDF object is created but nothing is written to it; the cleaned text is returned directly
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    f = all_text2
    return f
iface = gr.Interface(
    pdf,
    "file",
    "text"
)
if __name__ == "__main__":
    iface.launch(share=True)