Sanshruth's picture
Update app.py
6901b49 verified
import gradio as gr
from pdf2docx import Converter
from docx import Document
import os
import glob
import base64
from docx.shared import Inches, Pt
from docx.oxml import OxmlElement
from docx.enum.text import WD_ALIGN_PARAGRAPH
import xml.etree.ElementTree as ET
def find_ttf_fonts():
files = glob.glob('**/*.ttf', recursive=True)
return files
def embed_font_in_html(font_path, font_name, html_content):
with open(font_path, "rb") as font_file:
font_data = font_file.read()
encoded_font = base64.b64encode(font_data).decode('utf-8')
font_style = f"""
<style>
@font-face {{
font-family: '{font_name}';
src: url(data:font/ttf;base64,{encoded_font}) format('truetype');
}}
body {{
font-family: '{font_name}', Arial, sans-serif;
margin: 0;
padding: 0;
background-color: white;
}}
.page {{
position: relative;
width: 8.5in;
margin: 20px auto;
padding: 20px;
box-sizing: border-box;
background-color: white;
box-shadow: 0 0 10px rgba(0,0,0,0.1);
}}
.paragraph {{
margin: 0;
padding: 0;
position: relative;
}}
.image-container {{
display: inline-block;
position: relative;
vertical-align: middle;
}}
img {{
max-width: 100%;
height: auto;
display: inline-block;
vertical-align: middle;
}}
table {{
border-collapse: collapse;
width: 100%;
margin: 10px 0;
}}
td, th {{
border: 1px solid black;
padding: 8px;
position: relative;
}}
</style>
"""
return font_style + html_content
def extract_images_from_doc(doc):
images = {}
for rel in doc.part.rels.values():
if "image" in rel.reltype:
try:
image_data = rel.target_part.blob
image_type = rel.target_part.content_type.split('/')[-1]
if image_type.lower() not in ['jpeg', 'jpg', 'png', 'gif']:
image_type = 'png'
encoded_image = base64.b64encode(image_data).decode('utf-8')
images[rel.rId] = f"data:image/{image_type};base64,{encoded_image}"
except Exception as e:
print(f"Error processing image: {str(e)}")
continue
return images
def get_image_position(element):
try:
anchor = element.find('.//wp:anchor',
{'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
if anchor is not None:
pos_h = anchor.find('.//wp:positionH',
{'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
pos_v = anchor.find('.//wp:positionV',
{'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
if pos_h is not None and pos_v is not None:
x = pos_h.find('.//wp:posOffset',
{'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
y = pos_v.find('.//wp:posOffset',
{'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
if x is not None and y is not None:
return {
'x': int(x.text) / 914400,
'y': int(y.text) / 914400
}
except Exception:
pass
return None
def process_paragraph(paragraph, images_dict):
html_content = '<div class="paragraph">'
if paragraph.alignment == WD_ALIGN_PARAGRAPH.CENTER:
html_content += '<div style="text-align: center;">'
elif paragraph.alignment == WD_ALIGN_PARAGRAPH.RIGHT:
html_content += '<div style="text-align: right;">'
else:
html_content += '<div>'
for run in paragraph.runs:
style = []
if run.bold: style.append('font-weight: bold')
if run.italic: style.append('font-style: italic')
if run.underline: style.append('text-decoration: underline')
if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')
drawing_elements = run._element.findall('.//w:drawing',
{'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
for drawing in drawing_elements:
blip = drawing.find('.//a:blip',
{'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
if blip is not None:
image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
if image_rel_id in images_dict:
position = get_image_position(drawing)
if position:
style_pos = f"position: absolute; left: {position['x']}in; top: {position['y']}in;"
html_content += f'<div class="image-container" style="{style_pos}">'
html_content += f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>'
html_content += '</div>'
else:
html_content += f'<div class="image-container">'
html_content += f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>'
html_content += '</div>'
style_str = '; '.join(style)
if run.text.strip():
html_content += f'<span style="{style_str}">{run.text}</span>'
html_content += '</div></div>'
return html_content
def process_table(table, images_dict):
html_content = '<table>'
for row in table.rows:
html_content += '<tr>'
for cell in row.cells:
html_content += '<td>'
for paragraph in cell.paragraphs:
for run in paragraph.runs:
style = []
if run.bold: style.append('font-weight: bold')
if run.italic: style.append('font-style: italic')
if run.underline: style.append('text-decoration: underline')
if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')
drawing_elements = run._element.findall('.//w:drawing',
{'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
for drawing in drawing_elements:
blip = drawing.find('.//a:blip',
{'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
if blip is not None:
image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
if image_rel_id in images_dict:
html_content += f'<div class="image-container">'
html_content += f'<img src="{images_dict[image_rel_id]}" alt="Table Cell Image"/>'
html_content += '</div>'
style_str = '; '.join(style)
if run.text.strip():
html_content += f'<span style="{style_str}">{run.text}</span>'
html_content += '</td>'
html_content += '</tr>'
html_content += '</table>'
return html_content
def pdf_to_html(pdf_file, font_name):
if not pdf_file:
return None
try:
docx_filename = pdf_file.name.replace('.pdf', '.docx')
cv = Converter(pdf_file.name)
cv.convert(docx_filename)
cv.close()
doc = Document(docx_filename)
images_dict = extract_images_from_doc(doc)
html_content = """<!DOCTYPE html>
<html>
<head>
<meta charset='utf-8'>
<title>Converted Document</title>
</head>
<body>
<div class="page">
"""
paragraph_map = {}
current_paragraph_index = 0
for para in doc.paragraphs:
paragraph_map[para._element] = current_paragraph_index
current_paragraph_index += 1
for element in doc.element.body:
if element.tag.endswith('p'):
if element in paragraph_map:
paragraph = doc.paragraphs[paragraph_map[element]]
html_content += process_paragraph(paragraph, images_dict)
elif element.tag.endswith('tbl'):
table_index = len([e for e in doc.element.body[:doc.element.body.index(element)]
if e.tag.endswith('tbl')])
html_content += process_table(doc.tables[table_index], images_dict)
html_content += "</div></body></html>"
ttf_files = {os.path.basename(f): f for f in find_ttf_fonts()}
if font_name in ttf_files:
font_path = ttf_files[font_name]
font_name_clean = os.path.splitext(font_name)[0]
html_content = embed_font_in_html(font_path, font_name_clean, html_content)
html_filename = "output_with_font.html"
with open(html_filename, "w", encoding="utf-8") as html_file:
html_file.write(html_content)
os.remove(docx_filename)
return html_filename
except Exception as e:
print(f"Error in pdf_to_html: {str(e)}")
return None
# Gradio Interface
with gr.Blocks(theme=gr.themes.Soft()) as app:
gr.Markdown("# Bionic Reading PDF Converter")
gr.Markdown("### https://github.com/SanshruthR/Bionic_Reading_Hub")
with gr.Row():
gr.Image("image.jpeg",
label="Bionic Reading Example",
show_label=False,
width=400,
height=300)
with gr.Row():
with gr.Column(scale=2):
pdf_input = gr.File(
label="Upload Your PDF",
file_types=[".pdf"],
file_count="single"
)
ttf_files = find_ttf_fonts()
font_dropdown = gr.Dropdown(
[os.path.basename(font) for font in ttf_files],
label="Select Font Style",
value=os.path.basename(ttf_files[0]) if ttf_files else None,
info="Choose your preferred reading font"
)
convert_pdf_to_html = gr.Button(
"Convert to Bionic Format",
variant="primary",
size="lg"
)
font_output = gr.File(
label="Download Enhanced HTML File",
type="filepath"
)
with gr.Row():
example_files = [
os.path.join("examples", f)
for f in os.listdir("examples")
if f.endswith('.pdf')
] if os.path.exists("examples") else []
if example_files:
gr.Examples(
example_files,
pdf_input,
label="Sample PDFs"
)
with gr.Row():
gr.Markdown(
"""
---
📝 Best results with text-based PDFs (not scanned documents)
"""
)
convert_pdf_to_html.click(
pdf_to_html,
inputs=[pdf_input, font_dropdown],
outputs=[font_output]
)
app.launch()