Spaces:
Running
Running
File size: 4,631 Bytes
3919e25 2e2a7b2 4ffb3fe 9774c1c 456846b 9774c1c 5135496 d171ec8 8d346b2 456846b 8d346b2 9774c1c 8d346b2 de8ef09 9774c1c de8ef09 9774c1c 456846b 9774c1c bdb82f0 47a5a07 b4be601 9774c1c 9c777f4 352bdf6 d171ec8 a923971 3919e25 2d1281f a923971 d599b56 d02b2ab a923971 456846b 2e2a7b2 a923971 e86a2c5 2e2a7b2 b7f89cc 2e2a7b2 1c5b68c 456846b fb04ca9 352bdf6 456846b 352bdf6 456846b 373d23c 8f70505 502b110 d599b56 d846da3 90fd9f6 9c7f619 90fd9f6 9774c1c 90fd9f6 5470d55 90fd9f6 e8481be 73570f9 e8481be 90fd9f6 ba73e05 1b2fcc0 3919e25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import gradio as gr
import requests
from pypdf import PdfReader
import pypdfium2 as pdfium
import easyocr
# Display name shown in the "PDF Language" dropdown -> language code handed to
# easyocr.Reader. Insertion order is the order the dropdown lists the choices.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
def pdf_pil(file_path, page_num, up_scale):
    """Render one page of a PDF to a PNG file and return the PNG's filename.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; converted with ``int`` before use.
        up_scale: Render scale factor handed to pdfium (1 renders at 72 dpi).

    Returns:
        The filename ``image_<page_num>.png`` that the rendered page was
        saved to (relative to the current working directory).
    """
    # Open the document the caller passed in; the path used to be hard-coded
    # to "data.pdf", which silently ignored ``file_path``.
    pdf = pdfium.PdfDocument(file_path)
    page = pdf.get_page(int(page_num) - 1)
    bitmap = page.render(
        # Keep the fractional part of the scale: truncating with int() turned
        # every value below 1 into 0 and rounded e.g. 1.9 down to 1.
        scale=float(up_scale),
        rotation=0,  # no additional rotation
    )
    pil_image = bitmap.to_pil()
    image_name = f"image_{page_num}.png"
    pil_image.save(image_name)
    return image_name
def ocrpdf(file_path, pdf_lang, page_num, sent_wid, contrast_det, up_scale):
    """Run OCR on a single PDF page and return the collected text.

    The page is first rendered to an image with ``pdf_pil``; that image is
    then read by an EasyOCR reader built for the selected language.

    Args:
        file_path: Path of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: A key of ``ocr_id`` (the language's display name).
        page_num: Page number, forwarded to ``pdf_pil``.
        sent_wid: Passed to ``readtext`` as ``width_ths``.
        contrast_det: Passed to ``readtext`` as ``contrast_ths``.
        up_scale: Render scale, forwarded to ``pdf_pil``.

    Returns:
        One string in which every detection contributes ``" \\n"`` followed
        by element 1 of the detection (presumably the recognized text).
    """
    page_image = pdf_pil(file_path, page_num, up_scale)
    ocr_reader = easyocr.Reader([ocr_id[pdf_lang]])
    detections = ocr_reader.readtext(
        page_image,
        width_ths=sent_wid,
        contrast_ths=contrast_det,
    )
    # Each detection is prefixed with " \n", so the result starts with one too.
    return "".join(f" \n{detection[1]}" for detection in detections)
def scrape(instring):
    """Build the embedded Google Docs viewer markup for a PDF URL.

    Args:
        instring: URL of the PDF, as typed into the "PDF URL" textbox.

    Returns:
        A ``gr.HTML`` update whose value is an ``<iframe>`` pointing the
        Google Docs viewer at the given URL.
    """
    from urllib.parse import quote

    # Percent-encode the whole URL before placing it in the ``url=`` query
    # parameter: a raw "&" or "#" would cut the target URL short, and a raw
    # quote character would break out of the ``src`` attribute.
    encoded_url = quote(instring, safe="")
    html_src = (f'''
<div style="text-align:center">
<h4>PDF Viewer</h4>
<iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true" frameborder="0" height="1200px" width="100%"></iframe>
</div>''')
    return gr.HTML.update(html_src)
def scrape00(instring, page_num, pdf_lang, sent_wid, contrast_det, up_scale):
    """Download a PDF, extract the text of one page and summarize it.

    The page text is taken from the PDF's text layer first. If summarizing
    that text raises, the page is OCR'd with ``ocrpdf`` and the OCR text is
    summarized instead.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number to process.
        pdf_lang: Language display name, forwarded to ``ocrpdf``.
        sent_wid: Forwarded to ``ocrpdf``.
        contrast_det: Forwarded to ``ocrpdf``.
        up_scale: Forwarded to ``ocrpdf``.

    Returns:
        A 3-tuple ``(text, summary, status_update)``: the page text, the
        summarizer output (or the string ``"Error"``), and a ``gr.Markdown``
        update carrying a status message.
    """
    # A timeout keeps a stalled server from blocking the worker forever.
    response = requests.get(instring, timeout=30)
    if response.status_code != 200:
        # Stop here: continuing would parse a stale or missing "data.pdf".
        print(response.status_code)
        return "", "Error", gr.Markdown.update(
            f"""<h3> Download failed (HTTP {response.status_code})"""
        )

    # NOTE: every request writes to the same "data.pdf", so overlapping
    # requests can overwrite each other's download.
    with open("data.pdf", "wb") as f:
        f.write(response.content)

    reader = PdfReader("data.pdf")
    number_of_pages = len(reader.pages)
    page_index = int(page_num) - 1
    # Reject out-of-range pages explicitly; page 0 would otherwise wrap to
    # index -1 and silently select the last page.
    if not 0 <= page_index < number_of_pages:
        return "", "Error", gr.Markdown.update(
            f"""<h3> Page {page_num} is out of range (1-{number_of_pages})"""
        )

    text = reader.pages[page_index].extract_text()
    print(text)
    summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
    try:
        sum_out = summarizer(text)
    except Exception:
        # Summarizing the embedded text failed; fall back to OCR of the page.
        try:
            text = ocrpdf("data.pdf", pdf_lang, page_num, sent_wid, contrast_det, up_scale)
            sum_out = summarizer(text)
        except Exception:
            sum_out = "Error"
    return text, sum_out, gr.Markdown.update("""<h3> Complete""")
# Gradio UI: a URL + page-number row shared by two tabs (viewer / summarizer).
# NOTE(review): the source had lost its indentation, so the nesting of the
# Row/Box/Column/Tab containers below is reconstructed — confirm the layout.
with gr.Blocks() as app:
    gr.Markdown('''<h1>PDF Viewer''')

    # Inputs shared by both tabs.
    with gr.Row():
        inp = gr.Textbox(label="PDF URL", scale=3)
        pg_num = gr.Number(label="Page Number", value=1, precision=0, scale=1)

    with gr.Tab("View PDF"):
        go_btn = gr.Button("Load PDF")
        outp = gr.HTML()

    with gr.Tab("Summarize"):
        mes = gr.Markdown("""<h3> Summarize Text in PDF""")
        # OCR tuning controls, used when the OCR fallback runs.
        with gr.Row():
            with gr.Box():
                with gr.Column():
                    sent_wid = gr.Slider(0.1, 3, step=0.1, value=1, label="Horizontal Word Space")
                    contrast_det = gr.Slider(0.1, 1, step=0.1, value=0.1, label="Contrast Threshold")
                with gr.Column():
                    up_scale = gr.Slider(0.1, 5, step=0.1, value=1, label="PDF to Image Scale")
                with gr.Column():
                    target_lang = gr.Dropdown(
                        label="PDF Language",
                        choices=list(ocr_id.keys()),
                        value="English",
                    )
        sum_btn = gr.Button("Summarize")
        with gr.Row():
            text_out = gr.Textbox()
            sum_out = gr.Textbox()

    # Event wiring: the viewer button fills the iframe, the summarize button
    # fills both textboxes and updates the status heading.
    go_btn.click(scrape, inp, outp)
    sum_btn.click(
        scrape00,
        [inp, pg_num, target_lang, sent_wid, contrast_det, up_scale],
        [text_out, sum_out, mes],
    )

app.queue(concurrency_count=10).launch()