pdf-reader / app.py
Omnibus's picture
Update app.py
73570f9
import gradio as gr
import requests
from pypdf import PdfReader
import pypdfium2 as pdfium
import easyocr
# Maps the human-readable language names offered in the "PDF Language"
# dropdown to the language codes handed to ``easyocr.Reader``.
# Insertion order is the order in which the dropdown lists the choices.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
def pdf_pil(file_path, page_num, up_scale):
    """Render one page of a PDF to a PNG file and return the PNG's name.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; converted with ``int`` before use.
        up_scale: Render scale factor handed to pypdfium2 (a scale of 1
            corresponds to 72 dpi). Fractional values are honoured.

    Returns:
        The file name ``image_{page_num}.png``, written relative to the
        current working directory.
    """
    # Open the document the caller asked for instead of a hard-coded name.
    pdf = pdfium.PdfDocument(file_path)
    try:
        page = pdf.get_page(int(page_num) - 1)
        try:
            bitmap = page.render(
                # float, not int: the UI slider moves in 0.1 steps and
                # truncation would turn 0.1-0.9 into a scale of 0.
                scale=float(up_scale),
                rotation=0,  # no additional rotation
            )
            pil_image = bitmap.to_pil()
            out_name = f"image_{page_num}.png"
            # Save while the bitmap is still alive; the PIL image may share
            # its pixel buffer.
            pil_image.save(out_name)
        finally:
            page.close()
    finally:
        # Release the native document handle even if rendering failed.
        pdf.close()
    return out_name
def ocrpdf(file_path, pdf_lang, page_num, sent_wid, contrast_det, up_scale):
    """Run OCR on one PDF page and return the recognised text.

    The page is first rasterised through ``pdf_pil``; the resulting image is
    read with an EasyOCR reader created for the single language selected by
    ``pdf_lang`` (a key of ``ocr_id``).

    Args:
        file_path: PDF path forwarded to ``pdf_pil``.
        pdf_lang: Language name; looked up in ``ocr_id``.
        page_num: Page number forwarded to ``pdf_pil``.
        sent_wid: Passed to ``readtext`` as ``width_ths``.
        contrast_det: Passed to ``readtext`` as ``contrast_ths``.
        up_scale: Render scale forwarded to ``pdf_pil``.

    Returns:
        A string in which every detection contributes ``" \\n"`` followed by
        element 1 of the detection (empty string when nothing is detected).
    """
    page_image = pdf_pil(file_path, page_num, up_scale)
    ocr_reader = easyocr.Reader([ocr_id[pdf_lang]])
    detections = ocr_reader.readtext(
        page_image,
        width_ths=sent_wid,
        contrast_ths=contrast_det,
    )
    # Each entry is prefixed with " \n", so the result starts with that
    # separator whenever at least one detection exists.
    return "".join(f" \n{detection[1]}" for detection in detections)
def scrape(instring):
    """Build the embedded Google Docs viewer markup for a PDF URL.

    Args:
        instring: URL of the PDF as typed by the user.

    Returns:
        A ``gr.HTML.update`` carrying an ``<iframe>`` that points the Google
        Docs viewer at the given URL.
    """
    from urllib.parse import quote

    # The URL is user input placed inside a query parameter inside an HTML
    # attribute. Percent-encoding every reserved character keeps a target
    # URL's own "&"/"#"/"?" from truncating the viewer's ``url`` parameter
    # and keeps quotes or angle brackets from escaping the attribute.
    encoded_url = quote(instring or "", safe="")
    html_src = (
        '\n<div style="text-align:center">\n'
        '<h4>PDF Viewer</h4>\n'
        f'<iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true" '
        'frameborder="0" height="1200px" width="100%"></iframe>\n'
        '</div>'
    )
    return gr.HTML.update(html_src)
def scrape00(instring, page_num, pdf_lang, sent_wid, contrast_det, up_scale):
    """Download a PDF, extract one page's text and summarise it.

    The PDF is saved as ``data.pdf`` in the working directory. The page's
    text layer is summarised with the hosted ``facebook/bart-large-cnn``
    model; when the page has no text or the summariser fails, the page is
    OCR'd via ``ocrpdf`` and the OCR text is summarised instead.

    Args:
        instring: URL of the PDF.
        page_num: 1-based page number.
        pdf_lang: Language name (key of ``ocr_id``) for the OCR fallback.
        sent_wid: ``width_ths`` for the OCR fallback.
        contrast_det: ``contrast_ths`` for the OCR fallback.
        up_scale: Render scale for the OCR fallback.

    Returns:
        A 3-tuple ``(text, summary, status)`` where ``status`` is a
        ``gr.Markdown.update``. On failure ``summary`` is ``"Error"`` and
        ``status`` explains what went wrong where that is known.
    """
    # NOTE: "data.pdf" is a fixed name shared by every request, so concurrent
    # users can overwrite each other's download.
    try:
        # A timeout keeps a stalled server from blocking a worker forever.
        with requests.get(instring, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(response.status_code)
                # Stop here: continuing would parse a stale data.pdf left
                # behind by an earlier request.
                return "", "Error", gr.Markdown.update(
                    f"<h3> Download failed (HTTP {response.status_code})"
                )
            with open("data.pdf", "wb") as f:
                for chunk in response.iter_content(chunk_size=65536):
                    f.write(chunk)
    except requests.RequestException as err:
        print(err)
        return "", "Error", gr.Markdown.update("<h3> Download failed")

    reader = PdfReader("data.pdf")
    number_of_pages = len(reader.pages)
    try:
        page_index = int(page_num) - 1
    except (TypeError, ValueError):
        return "", "Error", gr.Markdown.update("<h3> Invalid page number")
    # Reject 0/negative pages explicitly; a negative index would otherwise
    # silently select a page counted from the end of the document.
    if not 0 <= page_index < number_of_pages:
        return "", "Error", gr.Markdown.update(
            f"<h3> Page {page_index + 1} is out of range (1-{number_of_pages})"
        )

    text = reader.pages[page_index].extract_text()
    print(text)
    summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")

    summarized = False
    sum_out = "Error"
    # Only summarise the text layer when there is one; image-only pages go
    # straight to the OCR fallback.
    if text and text.strip():
        try:
            sum_out = summarizer(text)
            summarized = True
        except Exception as err:  # best-effort: fall through to OCR
            print(err)
    if not summarized:
        try:
            text = ocrpdf("data.pdf", pdf_lang, page_num, sent_wid, contrast_det, up_scale)
            sum_out = summarizer(text)
        except Exception as err:  # UI boundary: report instead of crashing
            print(err)
            sum_out = "Error"
    return text, sum_out, gr.Markdown.update("""<h3> Complete""")
# UI definition: a URL/page-number header row, a "View PDF" tab that embeds
# the document, and a "Summarize" tab with OCR tuning controls.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a source whose indentation was lost - confirm against the intended
# layout.
with gr.Blocks() as app:
    gr.Markdown('''<h1>PDF Viewer''')
    # Inputs shared by both tabs.
    with gr.Row():
        inp=gr.Textbox(label="PDF URL",scale=3)
        pg_num=gr.Number(label="Page Number",value=1,precision=0,scale=1)
    with gr.Tab("View PDF"):
        go_btn = gr.Button("Load PDF")
        outp = gr.HTML()
    with gr.Tab("Summarize"):
        # Doubles as the status line: scrape00 updates it when it finishes.
        mes = gr.Markdown("""<h3> Summarize Text in PDF""")
        with gr.Row():
            with gr.Box():
                with gr.Column():
                    # Forwarded to easyocr readtext as width_ths / contrast_ths.
                    sent_wid=gr.Slider(0.1, 3, step=0.1,value=1,label="Horizontal Word Space")
                    contrast_det=gr.Slider(0.1, 1, step=0.1,value=0.1,label="Contrast Threshold")
                with gr.Column():
                    # Render scale used when a page is rasterised for OCR.
                    up_scale=gr.Slider(0.1, 5, step=0.1,value=1,label="PDF to Image Scale")
                with gr.Column():
                    target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")
        sum_btn = gr.Button("Summarize")
        with gr.Row():
            text_out = gr.Textbox()
            sum_out = gr.Textbox()
    # Event wiring: "Load PDF" renders the viewer; "Summarize" downloads,
    # extracts and summarises the selected page.
    go_btn.click(scrape,inp,outp)
    sum_btn.click(scrape00,[inp,pg_num,target_lang,sent_wid,contrast_det,up_scale],[text_out,sum_out,mes])
# Queue requests with up to 10 concurrent workers and start the server.
app.queue(concurrency_count=10).launch()