Spaces:
Running
Running
import gradio as gr | |
import requests | |
from pypdf import PdfReader | |
import pypdfium2 as pdfium | |
import easyocr | |
# Display name -> language code handed to ``easyocr.Reader``.
# Insertion order is preserved on purpose: the keys populate the
# "PDF Language" dropdown in the order listed here.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
def pdf_pil(file_path, page_num, up_scale):
    """Render one page of a PDF to a PNG file and return the PNG's path.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; converted with ``int`` before use.
        up_scale: Render scale factor passed to pypdfium2 (1 is the
            library's base 72 dpi resolution). Fractional values are
            honoured.

    Returns:
        The relative path ``image_<page_num>.png`` of the saved image.
    """
    # Open the document the caller asked for instead of a hard-coded
    # "data.pdf", so the ``file_path`` argument is actually respected.
    pdf = pdfium.PdfDocument(file_path)
    out_path = f"image_{page_num}.png"
    try:
        page = pdf.get_page(int(page_num) - 1)
        bitmap = page.render(
            # float, not int: the UI slider moves in 0.1 steps and
            # int() would truncate anything below 1 down to a scale of 0.
            scale=float(up_scale),
            rotation=0,  # no additional rotation
        )
        pil_image = bitmap.to_pil()
        # Save while the document is still open, in case the image
        # shares memory with the rendered bitmap.
        pil_image.save(out_path)
    finally:
        # Release the native document handle even if rendering fails.
        pdf.close()
    return out_path
def ocrpdf(file_path, pdf_lang, page_num, sent_wid, contrast_det, up_scale):
    """OCR a single PDF page and return the recognized text.

    The page is first rendered to an image via ``pdf_pil``; the image is
    then read with an EasyOCR reader created for the selected language.

    Args:
        file_path: Path of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: Display name of the language; must be a key of ``ocr_id``.
        page_num: Page number, forwarded to ``pdf_pil``.
        sent_wid: Passed to ``readtext`` as ``width_ths``.
        contrast_det: Passed to ``readtext`` as ``contrast_ths``.
        up_scale: Render scale, forwarded to ``pdf_pil``.

    Returns:
        A string in which every detection contributes ``" \\n"`` followed
        by the second element of that detection (the text entry of the
        EasyOCR result); empty when nothing was detected.
    """
    page_image = pdf_pil(file_path, page_num, up_scale)
    languages = [ocr_id[pdf_lang]]
    ocr_reader = easyocr.Reader(languages)
    detections = ocr_reader.readtext(
        page_image,
        width_ths=sent_wid,
        contrast_ths=contrast_det,
    )
    # Each detection is prefixed by " \n", exactly as the running
    # concatenation would produce.
    return "".join(f" \n{detection[1]}" for detection in detections)
def scrape(instring):
    """Build the embedded Google Docs viewer markup for a PDF URL.

    Args:
        instring: URL of the PDF as typed by the user.

    Returns:
        A ``gr.HTML`` update whose value is an ``<iframe>`` pointing the
        Google Docs viewer at the given URL.
    """
    # Imported locally so the block does not depend on edits elsewhere.
    from urllib.parse import quote

    # The URL is user input that ends up inside a query string inside an
    # HTML attribute. Percent-encoding every reserved character keeps
    # "&"/"#" in the PDF URL from being read as viewer parameters and
    # keeps quotes/angle brackets from breaking out of the attribute.
    encoded_url = quote(instring or "", safe="")
    html_src = (f'''
<div style="text-align:center">
<h4>PDF Viewer</h4>
<iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true" frameborder="0" height="1200px" width="100%"></iframe>
</div>''')
    return gr.HTML.update(f'''{html_src}''')
def scrape00(instring, page_num, pdf_lang, sent_wid, contrast_det, up_scale):
    """Download a PDF, extract the text of one page and summarize it.

    The text layer of the page is summarized first. If summarizing that
    text raises, the page is OCR'd with ``ocrpdf`` and the OCR text is
    summarized instead.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number to process.
        pdf_lang: Language display name, forwarded to ``ocrpdf``.
        sent_wid: Forwarded to ``ocrpdf``.
        contrast_det: Forwarded to ``ocrpdf``.
        up_scale: Forwarded to ``ocrpdf``.

    Returns:
        A 3-tuple ``(text, summary, status)``: the page text, the
        summarizer output (or the string ``"Error"``), and a
        ``gr.Markdown`` update carrying a status message.
    """
    # NOTE(review): every request writes to the same "data.pdf"; with a
    # concurrent queue two users can overwrite each other's download.
    # The fixed name is kept because the OCR helpers read that path.
    try:
        # A timeout keeps a stalled server from hanging the worker forever.
        response = requests.get(instring, timeout=30)
    except requests.RequestException as err:
        print(err)
        return "", "Error", gr.Markdown.update("""<h3> Download failed""")

    if response.status_code != 200:
        # Stop here: continuing would parse a stale or missing data.pdf.
        print(response.status_code)
        return "", "Error", gr.Markdown.update(
            f"""<h3> Download failed (HTTP {response.status_code})"""
        )

    with open("data.pdf", "wb") as f:
        f.write(response.content)

    reader = PdfReader("data.pdf")
    number_of_pages = len(reader.pages)
    page_index = int(page_num) - 1
    # Reject out-of-range pages explicitly; a 0 or negative page number
    # would otherwise silently wrap around to pages counted from the end.
    if not 0 <= page_index < number_of_pages:
        return "", "Error", gr.Markdown.update(
            f"""<h3> Page {page_num} is out of range (1-{number_of_pages})"""
        )

    text = reader.pages[page_index].extract_text()
    print(text)

    summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
    try:
        sum_out = summarizer(text)
    except Exception:
        # The text layer could not be summarized; fall back to OCR of
        # the rendered page.
        try:
            text = ocrpdf("data.pdf", pdf_lang, page_num, sent_wid, contrast_det, up_scale)
            sum_out = summarizer(text)
        except Exception:
            sum_out = "Error"
    return text, sum_out, gr.Markdown.update("""<h3> Complete""")
# UI definition. NOTE(review): the nesting of the layout containers below
# was reconstructed from their order of appearance — confirm against the
# rendered layout.
with gr.Blocks() as app:
    gr.Markdown('''<h1>PDF Viewer''')

    # Inputs shared by both tabs: the PDF URL and the page to work on.
    with gr.Row():
        inp = gr.Textbox(label="PDF URL", scale=3)
        pg_num = gr.Number(label="Page Number", value=1, precision=0, scale=1)

    with gr.Tab("View PDF"):
        go_btn = gr.Button("Load PDF")
        outp = gr.HTML()

    with gr.Tab("Summarize"):
        # Also used as the status line updated by ``scrape00``.
        mes = gr.Markdown("""<h3> Summarize Text in PDF""")
        with gr.Row():
            with gr.Box():
                with gr.Column():
                    sent_wid = gr.Slider(
                        0.1, 3, step=0.1, value=1, label="Horizontal Word Space"
                    )
                    contrast_det = gr.Slider(
                        0.1, 1, step=0.1, value=0.1, label="Contrast Threshold"
                    )
            with gr.Column():
                up_scale = gr.Slider(
                    0.1, 5, step=0.1, value=1, label="PDF to Image Scale"
                )
            with gr.Column():
                target_lang = gr.Dropdown(
                    label="PDF Language",
                    choices=list(ocr_id.keys()),
                    value="English",
                )
                sum_btn = gr.Button("Summarize")
        with gr.Row():
            text_out = gr.Textbox()
            sum_out = gr.Textbox()

    # Event wiring: the viewer button fills the HTML pane; the summarize
    # button fills both text boxes and the status line.
    go_btn.click(fn=scrape, inputs=inp, outputs=outp)
    sum_btn.click(
        fn=scrape00,
        inputs=[inp, pg_num, target_lang, sent_wid, contrast_det, up_scale],
        outputs=[text_out, sum_out, mes],
    )

app.queue(concurrency_count=10).launch()