Spaces:
Sleeping
Sleeping
File size: 5,264 Bytes
73683aa 0cc1374 73683aa 6b438f3 73683aa 6b438f3 b07522c 6b438f3 b07522c 73683aa 6b438f3 2dbedf0 6b438f3 a011e6d 6b438f3 a011e6d 73683aa 6b438f3 73683aa 6b438f3 a73ec05 6b438f3 a73ec05 6b438f3 a73ec05 73683aa 6b438f3 73683aa a73ec05 73683aa a73ec05 6b438f3 a73ec05 73683aa 0a681f9 0cc1374 a232e1e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# app.py
import os
import gradio as gr
from gradio_pdf import PDF
import logging
from model import model_initialized
from pdf_processor import to_pdf, to_markdown
from config import config
from tts import text_to_speech # Import TTS module
# Configure the root logger: INFO level, "LEVEL: message" line format.
# The ANSI color codes are not set here; they are wrapped around each message
# by the log_info / log_error helpers.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
def log_info(message: str):
    """Log *message* at INFO level, wrapped in ANSI green (92) and reset (0) codes."""
    colored = "\033[92m{}\033[0m".format(message)
    logging.info(colored)
def log_error(message: str):
    """Log *message* at ERROR level, wrapped in ANSI red (91) and reset (0) codes."""
    colored = "\033[91m{}\033[0m".format(message)
    logging.error(colored)
# Load the HTML banner rendered at the top of the page. Any failure is logged
# and replaced by a placeholder so the app can still start without the file.
try:
    # Explicit encoding: without it open() falls back to the platform locale,
    # which can mis-decode or reject non-ASCII characters in the HTML.
    with open("header.html", "r", encoding="utf-8") as file:
        header = file.read()
    log_info("Header loaded successfully.")
except Exception as e:
    log_error(f"Failed to load header.html. Error: {e}")
    header = "<h1>Header not found</h1>"
# Language codes offered in the "Language" dropdown, grouped by script family.
latin_lang = [
    "af", "az", "bs", "cs", "cy", "da", "de", "es", "et", "fr",
    "ga", "hr", "hu", "id", "is", "it", "ku", "la", "lt", "lv",
    "mi", "ms", "mt", "nl", "no", "oc", "pi", "pl", "pt", "ro",
    "rs_latin", "sk", "sl", "sq", "sv", "sw", "tl", "tr", "uz", "vi",
    "french", "german",
]
arabic_lang = ["ar", "fa", "ug", "ur"]
cyrillic_lang = [
    "ru", "rs_cyrillic", "be", "bg", "uk", "mn", "abq", "ady",
    "kbd", "ava", "dar", "inh", "che", "lbe", "lez", "tab",
]
devanagari_lang = [
    "hi", "mr", "ne", "bh", "mai", "ang", "bho",
    "mah", "sck", "new", "gom", "sa", "bgc",
]
other_lang = ["ch", "en", "korean", "japan", "chinese_cht", "ta", "te", "ka"]

# Full dropdown list: the empty choice and "auto" come first, then every group.
all_lang = [
    "",
    "auto",
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]
def file_to_pdf(file_obj):
    """
    Convert an uploaded file to a PDF for the preview pane.

    `file_obj` is the upload value; its `.name` attribute is handed to
    `to_pdf`. Returns whatever `to_pdf` returns, or None when nothing was
    uploaded or the conversion raised (the error is logged, not re-raised).
    """
    if file_obj is None:
        return None

    try:
        result = to_pdf(file_obj.name)
        log_info("File converted to PDF successfully.")
    except Exception as err:
        log_error(f"Error converting file to PDF: {err}")
        result = None
    return result
def generate_audio(text: str) -> str:
    """
    Turn `text` into speech via `text_to_speech` and return its result.

    An empty string is returned (and an error is logged) when `text` is
    empty or when speech generation raises.
    """
    if not text:
        log_error("No text provided for TTS.")
        return ""

    try:
        audio_path = text_to_speech(text)
        log_info("Audio generated successfully.")
    except Exception as err:
        log_error(f"Audio generation failed: {err}")
        return ""
    return audio_path
# Build the Gradio interface: upload, options and PDF preview on the left;
# conversion results and read-aloud playback on the right.
# NOTE(review): widget nesting was reconstructed from the statement order —
# confirm the layout (in particular where the audio widgets sit) against the
# running app.
with gr.Blocks() as demo:
    gr.HTML(header)
    with gr.Row():
        # Left panel: input file, conversion options, preview and examples.
        with gr.Column(variant='panel', scale=5):
            file_input = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
            max_pages = gr.Slider(1, 20, config.get("max_pages_default", 10), step=1, label='Max convert pages')
            with gr.Row():
                layout_mode = gr.Dropdown(
                    ["layoutlmv3", "doclayout_yolo"],
                    label="Layout model",
                    value=config.get("layout_model_default", "doclayout_yolo")
                )
                language = gr.Dropdown(
                    all_lang,
                    label="Language",
                    value=config.get("language_default", "auto")
                )
            with gr.Row():
                formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
                is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
                table_enable = gr.Checkbox(label="Enable table recognition", value=True)
            with gr.Row():
                convert_button = gr.Button("Convert")
                clear_button = gr.ClearButton(value="Clear")
            pdf_display = PDF(label='PDF preview', interactive=False, visible=True, height=800)

            # Collect bundled example PDFs. A missing "examples" directory must
            # not prevent the app from starting, so it is treated as "no
            # examples". Matching on the ".pdf" suffix (case-insensitive) avoids
            # picking up names that merely end in "pdf"; sorting keeps the
            # display order stable across runs.
            example_root = os.path.join(os.path.dirname(__file__), "examples")
            if os.path.isdir(example_root):
                examples = sorted(
                    os.path.join(example_root, f)
                    for f in os.listdir(example_root)
                    if f.lower().endswith(".pdf")
                )
            else:
                examples = []
            if examples:
                with gr.Accordion("Examples:"):
                    gr.Examples(examples=examples, inputs=file_input)

        # Right panel: downloadable result, Markdown views and TTS playback.
        with gr.Column(variant='panel', scale=5):
            output_file = gr.File(label="Convert result", interactive=False)
            with gr.Tabs():
                with gr.Tab("Markdown rendering"):
                    md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
                with gr.Tab("Markdown text"):
                    md_text = gr.TextArea(lines=45, show_copy_button=True)
            # Audio component for TTS playback
            audio_output = gr.Audio(label="Read Aloud", type="filepath")
            read_button = gr.Button("Read Aloud")

    # Refresh the PDF preview whenever the uploaded file changes.
    file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
    # Run the conversion and fill both Markdown views, the result file and the preview.
    convert_button.click(
        fn=to_markdown,
        inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
        outputs=[md_render, md_text, output_file, pdf_display]
    )
    # When "Read Aloud" is clicked, generate audio from the markdown text
    read_button.click(
        fn=generate_audio,
        inputs=md_text,
        outputs=audio_output
    )
    # audio_output is cleared too, so audio generated for a previous document
    # does not stay playable after the text it was read from has been cleared.
    clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr, audio_output])
# Launch the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch(ssr_mode=True)
|