# Spaces: Running
import gradio as gr | |
from pdf2docx import Converter | |
from docx import Document | |
import os | |
import glob | |
import base64 | |
from docx.shared import Inches, Pt | |
from docx.oxml import OxmlElement | |
from docx.enum.text import WD_ALIGN_PARAGRAPH | |
import xml.etree.ElementTree as ET | |
def find_ttf_fonts():
    """Return the TrueType font files found below the current directory.

    The search is recursive and relative to the current working directory.
    The ``.ttf`` extension is matched case-insensitively so that files named
    ``*.TTF`` are also found on case-sensitive filesystems.

    Returns:
        A sorted list of relative paths. Sorting makes the result (and
        therefore whichever entry a caller treats as "first") deterministic,
        since ``glob`` itself returns matches in arbitrary filesystem order.
    """
    return sorted(glob.glob('**/*.[tT][tT][fF]', recursive=True))
def embed_font_in_html(font_path, font_name, html_content):
    """Embed a TrueType font and the page stylesheet into an HTML document.

    The font file is read as bytes, base64-encoded and referenced from an
    ``@font-face`` rule through a ``data:`` URI, so the resulting HTML does not
    depend on any external font file.

    Args:
        font_path: Path of the ``.ttf`` file to embed.
        font_name: Family name used in the ``@font-face`` rule and as the
            first choice of the ``body`` font stack.
        html_content: The HTML markup to decorate.

    Returns:
        ``html_content`` with a ``<style>`` block added. When the markup has a
        closing ``</head>`` tag the block is inserted right before it, so it
        never ends up in front of the ``<!DOCTYPE>`` declaration. Markup
        without a ``</head>`` tag gets the block prepended.

    Raises:
        OSError: If ``font_path`` cannot be opened or read.
    """
    import re

    with open(font_path, "rb") as font_file:
        encoded_font = base64.b64encode(font_file.read()).decode('utf-8')

    font_style = f"""
<style>
    @font-face {{
        font-family: '{font_name}';
        src: url(data:font/ttf;base64,{encoded_font}) format('truetype');
    }}
    body {{
        font-family: '{font_name}', Arial, sans-serif;
        margin: 0;
        padding: 0;
        background-color: white;
    }}
    .page {{
        position: relative;
        width: 8.5in;
        margin: 20px auto;
        padding: 20px;
        box-sizing: border-box;
        background-color: white;
        box-shadow: 0 0 10px rgba(0,0,0,0.1);
    }}
    .paragraph {{
        margin: 0;
        padding: 0;
        position: relative;
    }}
    .image-container {{
        display: inline-block;
        position: relative;
        vertical-align: middle;
    }}
    img {{
        max-width: 100%;
        height: auto;
        display: inline-block;
        vertical-align: middle;
    }}
    table {{
        border-collapse: collapse;
        width: 100%;
        margin: 10px 0;
    }}
    td, th {{
        border: 1px solid black;
        padding: 8px;
        position: relative;
    }}
</style>
"""

    # Only the first </head> counts: anything after it belongs to the body.
    head_close = re.search(r"</head\s*>", html_content, re.IGNORECASE)
    if head_close is None:
        return font_style + html_content
    cut = head_close.start()
    return html_content[:cut] + font_style + html_content[cut:]
def extract_images_from_doc(doc):
    """Collect the images of a document as base64 ``data:`` URIs.

    Every relationship of ``doc.part`` whose type mentions ``image`` is read
    and encoded. The MIME subtype is taken from the part's content type when
    it is one browsers can display; ``jpg`` is normalised to ``jpeg``. Any
    other subtype is labelled ``png``, as a best-effort fallback.

    Args:
        doc: A document object exposing ``part.rels`` (a mapping of
            relationships with ``reltype``, ``rId`` and ``target_part``).

    Returns:
        A dict mapping relationship id to a ``data:image/...;base64,...`` URI.
        Relationships that cannot be read are reported on stdout and skipped.
    """
    # Subtypes that can be used as-is in an <img> data URI. SVG in particular
    # must keep its real type: it is not rendered when labelled as PNG.
    displayable = {'jpeg', 'png', 'gif', 'bmp', 'webp', 'svg+xml'}

    images = {}
    for rel in doc.part.rels.values():
        if "image" not in rel.reltype:
            continue
        try:
            part = rel.target_part
            image_type = part.content_type.split('/')[-1].lower()
            if image_type == 'jpg':
                image_type = 'jpeg'
            elif image_type not in displayable:
                image_type = 'png'
            encoded_image = base64.b64encode(part.blob).decode('utf-8')
            images[rel.rId] = f"data:image/{image_type};base64,{encoded_image}"
        except Exception as e:
            # Deliberately best-effort: one unreadable image must not abort
            # the whole conversion.
            print(f"Error processing image: {str(e)}")
    return images
def get_image_position(element):
    """Return the anchor offset of a drawing in inches, if it has one.

    The function looks for a ``wp:anchor`` element below ``element`` and reads
    the ``wp:posOffset`` of its ``wp:positionH`` and ``wp:positionV`` children.
    Offsets are divided by 914400 (EMUs per inch).

    Args:
        element: An XML element supporting ``find(path, namespaces)``.

    Returns:
        ``{'x': <inches>, 'y': <inches>}`` when both offsets are present and
        numeric, otherwise ``None`` (no anchor, a position without a numeric
        offset, or an ``element`` that cannot be searched).
    """
    ns = {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}
    emu_per_inch = 914400

    try:
        anchor = element.find('.//wp:anchor', ns)
        if anchor is None:
            return None

        pos_h = anchor.find('.//wp:positionH', ns)
        pos_v = anchor.find('.//wp:positionV', ns)
        if pos_h is None or pos_v is None:
            return None

        x = pos_h.find('.//wp:posOffset', ns)
        y = pos_v.find('.//wp:posOffset', ns)
        if x is None or y is None:
            return None

        return {
            'x': int(x.text) / emu_per_inch,
            'y': int(y.text) / emu_per_inch,
        }
    except (AttributeError, TypeError, ValueError):
        # Best-effort: an unsearchable element or an empty / non-numeric
        # offset simply means "no usable position".
        return None
def process_paragraph(paragraph, images_dict):
    """Render one document paragraph as an HTML fragment.

    The fragment is a ``<div class="paragraph">`` wrapping an inner ``<div>``
    that carries the text alignment (center, right or justify; anything else
    is left unstyled). Each run contributes its embedded images followed by a
    ``<span>`` holding the run's text with bold / italic / underline / font
    size expressed as inline CSS.

    Args:
        paragraph: Paragraph object exposing ``alignment`` and ``runs``.
        images_dict: Mapping of image relationship id to ``data:`` URI; images
            whose id is not in the mapping are skipped.

    Returns:
        The HTML markup for the paragraph as a string.
    """
    from html import escape

    w_ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    a_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    text_align = {
        WD_ALIGN_PARAGRAPH.CENTER: 'center',
        WD_ALIGN_PARAGRAPH.RIGHT: 'right',
        WD_ALIGN_PARAGRAPH.JUSTIFY: 'justify',
    }.get(paragraph.alignment)

    parts = ['<div class="paragraph">']
    if text_align is None:
        parts.append('<div>')
    else:
        parts.append(f'<div style="text-align: {text_align};">')

    for run in paragraph.runs:
        style = []
        if run.bold:
            style.append('font-weight: bold')
        if run.italic:
            style.append('font-style: italic')
        if run.underline:
            style.append('text-decoration: underline')
        if run.font.size:
            style.append(f'font-size: {run.font.size.pt}pt')

        for drawing in run._element.findall('.//w:drawing', w_ns):
            blip = drawing.find('.//a:blip', a_ns)
            if blip is None:
                continue
            image_rel_id = blip.get(embed_attr)
            if image_rel_id not in images_dict:
                continue
            position = get_image_position(drawing)
            if position:
                # Anchored image: place it at its recorded offset.
                style_pos = f"position: absolute; left: {position['x']}in; top: {position['y']}in;"
                parts.append(f'<div class="image-container" style="{style_pos}">')
            else:
                parts.append('<div class="image-container">')
            parts.append(f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>')
            parts.append('</div>')

        # Whitespace-only runs are kept: they are the spaces between
        # differently formatted words. Text is escaped so that characters
        # such as "<" or "&" cannot break the generated markup.
        if run.text:
            style_str = '; '.join(style)
            parts.append(f'<span style="{style_str}">{escape(run.text)}</span>')

    parts.append('</div></div>')
    return ''.join(parts)
def process_table(table, images_dict):
    """Render a document table as an HTML ``<table>``.

    Every row becomes a ``<tr>`` and every cell a ``<td>``. Inside a cell each
    paragraph is wrapped in its own ``<div class="paragraph">`` so consecutive
    paragraphs do not run into each other. Each run contributes its embedded
    images followed by a ``<span>`` holding the run's text with bold / italic /
    underline / font size expressed as inline CSS.

    Args:
        table: Table object exposing ``rows`` -> ``cells`` -> ``paragraphs``
            -> ``runs``.
        images_dict: Mapping of image relationship id to ``data:`` URI; images
            whose id is not in the mapping are skipped.

    Returns:
        The HTML markup for the table as a string.
    """
    from html import escape

    w_ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    a_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    parts = ['<table>']
    for row in table.rows:
        parts.append('<tr>')
        for cell in row.cells:
            parts.append('<td>')
            for paragraph in cell.paragraphs:
                parts.append('<div class="paragraph">')
                for run in paragraph.runs:
                    style = []
                    if run.bold:
                        style.append('font-weight: bold')
                    if run.italic:
                        style.append('font-style: italic')
                    if run.underline:
                        style.append('text-decoration: underline')
                    if run.font.size:
                        style.append(f'font-size: {run.font.size.pt}pt')

                    for drawing in run._element.findall('.//w:drawing', w_ns):
                        blip = drawing.find('.//a:blip', a_ns)
                        if blip is None:
                            continue
                        image_rel_id = blip.get(embed_attr)
                        if image_rel_id in images_dict:
                            parts.append('<div class="image-container">')
                            parts.append(f'<img src="{images_dict[image_rel_id]}" alt="Table Cell Image"/>')
                            parts.append('</div>')

                    # Keep whitespace-only runs (word separators) and escape
                    # the text so "<" or "&" cannot break the markup.
                    if run.text:
                        style_str = '; '.join(style)
                        parts.append(f'<span style="{style_str}">{escape(run.text)}</span>')
                parts.append('</div>')
            parts.append('</td>')
        parts.append('</tr>')
    parts.append('</table>')
    return ''.join(parts)
def pdf_to_html(pdf_file, font_name):
    """Convert an uploaded PDF into a standalone HTML file.

    The PDF is first converted to DOCX, the DOCX body is then walked in
    document order and every paragraph / table is rendered to HTML. When
    ``font_name`` matches the basename of a font returned by
    ``find_ttf_fonts()``, that font and the page stylesheet are embedded.

    All intermediate and output files live in a fresh temporary directory, so
    simultaneous conversions cannot overwrite each other's results and the
    uploaded PDF itself is never touched.

    Args:
        pdf_file: The upload, either an object exposing the path as ``.name``
            or a plain path string. A falsy value means "nothing uploaded".
        font_name: Basename of the ``.ttf`` file to embed (e.g. ``Foo.ttf``);
            no stylesheet is added when it matches no known font.

    Returns:
        Path of the generated ``output_with_font.html``, or ``None`` when no
        file was given or the conversion failed (the error is printed).
    """
    if not pdf_file:
        return None

    import shutil
    import tempfile

    pdf_path = getattr(pdf_file, "name", pdf_file)
    work_dir = None
    try:
        work_dir = tempfile.mkdtemp(prefix="pdf_to_html_")
        docx_filename = os.path.join(work_dir, "converted.docx")

        cv = Converter(pdf_path)
        try:
            cv.convert(docx_filename)
        finally:
            # Release the PDF even when the conversion itself fails.
            cv.close()

        doc = Document(docx_filename)
        images_dict = extract_images_from_doc(doc)

        # doc.paragraphs / doc.tables build a new list on every access, so
        # index them once by their XML element instead of once per element.
        paragraphs_by_element = {para._element: para for para in doc.paragraphs}
        tables_by_element = {table._element: table for table in doc.tables}

        parts = ["""<!DOCTYPE html>
<html>
<head>
<meta charset='utf-8'>
<title>Converted Document</title>
</head>
<body>
<div class="page">
"""]
        # Walk the body so paragraphs and tables keep their document order.
        for element in doc.element.body:
            if element in paragraphs_by_element:
                parts.append(process_paragraph(paragraphs_by_element[element], images_dict))
            elif element in tables_by_element:
                parts.append(process_table(tables_by_element[element], images_dict))
        parts.append("</div></body></html>")
        html_content = "".join(parts)

        ttf_files = {os.path.basename(f): f for f in find_ttf_fonts()}
        if font_name in ttf_files:
            font_path = ttf_files[font_name]
            font_name_clean = os.path.splitext(font_name)[0]
            html_content = embed_font_in_html(font_path, font_name_clean, html_content)

        html_filename = os.path.join(work_dir, "output_with_font.html")
        with open(html_filename, "w", encoding="utf-8") as html_file:
            html_file.write(html_content)

        os.remove(docx_filename)
        return html_filename
    except Exception as e:
        # Top-level boundary for the UI callback: report and signal failure.
        print(f"Error in pdf_to_html: {str(e)}")
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)
        return None
# Gradio Interface
# NOTE(review): the nesting of the layout containers below is reconstructed
# from the order of the statements -- confirm against the running app.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# Bionic Reading PDF Converter")
    gr.Markdown("### https://github.com/SanshruthR/Bionic_Reading_Hub")

    with gr.Row():
        gr.Image("image.jpeg",
                 label="Bionic Reading Example",
                 show_label=False,
                 width=400,
                 height=300)

    with gr.Row():
        with gr.Column(scale=2):
            pdf_input = gr.File(
                label="Upload Your PDF",
                file_types=[".pdf"],
                file_count="single"
            )
            # The dropdown lists font basenames; pdf_to_html maps the chosen
            # basename back to its path.
            ttf_files = find_ttf_fonts()
            font_dropdown = gr.Dropdown(
                [os.path.basename(font) for font in ttf_files],
                label="Select Font Style",
                value=os.path.basename(ttf_files[0]) if ttf_files else None,
                info="Choose your preferred reading font"
            )
            convert_pdf_to_html = gr.Button(
                "Convert to Bionic Format",
                variant="primary",
                size="lg"
            )
            font_output = gr.File(
                label="Download Enhanced HTML File",
                type="filepath"
            )

    with gr.Row():
        # isdir (not exists) so a stray file named "examples" cannot make
        # os.listdir raise; sorted so the sample order is stable.
        example_files = sorted(
            os.path.join("examples", f)
            for f in os.listdir("examples")
            if f.lower().endswith('.pdf')
        ) if os.path.isdir("examples") else []
        if example_files:
            gr.Examples(
                example_files,
                pdf_input,
                label="Sample PDFs"
            )

    with gr.Row():
        gr.Markdown(
            """
---
📝 Best results with text-based PDFs (not scanned documents)
"""
        )

    convert_pdf_to_html.click(
        pdf_to_html,
        inputs=[pdf_input, font_dropdown],
        outputs=[font_output]
    )

app.launch()