File size: 4,778 Bytes
73683aa
0cc1374
 
a73ec05
 
73683aa
a73ec05
73683aa
 
a73ec05
b07522c
73683aa
 
b07522c
73683aa
a73ec05
2dbedf0
 
a73ec05
 
 
 
 
a011e6d
a73ec05
 
 
 
a011e6d
 
73683aa
 
a73ec05
73683aa
 
 
 
 
a73ec05
 
 
 
 
 
 
 
 
 
 
 
 
 
73683aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a73ec05
 
 
73683aa
a73ec05
73683aa
a73ec05
73683aa
 
 
 
 
a73ec05
 
 
73683aa
0a681f9
0cc1374
a232e1e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# app.py
import os
import gradio as gr
import logging
import tempfile
from gradio_pdf import PDF
from config import config
from model import model_initialized
from pdf_processor import to_pdf, to_markdown
from tts import text_to_speech_openai, text_to_speech_gtts

# Set up logging: INFO-level root logger for the whole app.
logging.basicConfig(level=logging.INFO)

# Load header HTML content.
# The path is resolved relative to this file (the same way the examples
# directory is located further down) rather than the current working
# directory, so the app also starts when launched from another directory.
with open(os.path.join(os.path.dirname(__file__), "header.html"), "r", encoding="utf-8") as file:
    header = file.read()

# Language codes offered in the "Language" dropdown, grouped (per the
# variable names) by script family. These could also live in config.yaml.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

# Full dropdown choice list: the empty choice and 'auto' come first, then
# every group in a fixed order.
all_lang = [
    '',
    'auto',
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]

# Turn an uploaded file into a PDF via to_pdf()
def file_to_pdf(file_obj):
    """Return ``to_pdf`` applied to the upload's path, or ``None`` without an upload.

    Args:
        file_obj: Uploaded file object exposing a ``name`` attribute (the
            path handed to ``to_pdf``), or ``None`` when nothing is uploaded.

    Returns:
        Whatever ``to_pdf`` returns for ``file_obj.name``; ``None`` when
        ``file_obj`` is ``None``.
    """
    # Guard clause: nothing uploaded (or the upload was cleared).
    if file_obj is None:
        return None

    uploaded_path = file_obj.name
    return to_pdf(uploaded_path)

# Define a function to handle TTS using OpenAI (with fallback)
def read_text(text, language="en"):
    """
    Synthesize speech from text using OpenAI TTS, falling back to gTTS.

    Args:
        text: Text to read aloud. ``None``, empty, or whitespace-only text is
            rejected up front without calling either TTS backend.
        language: Language code forwarded unchanged to both TTS helpers.

    Returns:
        A status string for display: ``"No text to read"`` when there is
        nothing to speak, otherwise ``"Audio played successfully"``.

    Raises:
        Exception: Whatever ``text_to_speech_gtts`` raises when the fallback
            also fails; only the OpenAI failure is caught here.
    """
    # Nothing to speak (e.g. the button was clicked before any conversion):
    # report it as a status instead of letting both backends fail on blank
    # input.
    if not text or not str(text).strip():
        logging.warning("TTS requested with empty text; nothing to read.")
        return "No text to read"

    try:
        text_to_speech_openai(text, language)
    except Exception as e:
        # Best-effort fallback: any OpenAI failure is logged, then gTTS is tried.
        logging.error("OpenAI TTS failed: %s. Falling back to gTTS.", e)
        text_to_speech_gtts(text, language)
    return "Audio played successfully"

# Set up the Gradio Blocks interface.
# Layout: the header HTML on top, then one row with two equally scaled panel
# columns. The left column holds the upload, conversion options, action
# buttons, PDF preview and examples; the right column holds the conversion
# result, the Markdown views and the text-to-speech controls.
with gr.Blocks() as demo:
    gr.HTML(header)
    with gr.Row():
        # Left column: input file and conversion settings
        with gr.Column(variant='panel', scale=5):
            file_input = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
            # Slider from 1 to 20 in steps of 1; initial value comes from config (fallback 10)
            max_pages = gr.Slider(1, 20, config.get("max_pages_default", 10), step=1, label='Max convert pages')
            with gr.Row():
                layout_mode = gr.Dropdown(
                    ["layoutlmv3", "doclayout_yolo"],
                    label="Layout model",
                    value=config.get("layout_model_default", "doclayout_yolo")
                )
                # Choices include '' and 'auto' in addition to the language codes
                language = gr.Dropdown(
                    all_lang,
                    label="Language",
                    value=config.get("language_default", "auto")
                )
            with gr.Row():
                formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
                is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
                table_enable = gr.Checkbox(label="Enable table recognition", value=True)
            with gr.Row():
                convert_button = gr.Button("Convert")
                clear_button = gr.ClearButton(value="Clear")
            pdf_display = PDF(label='PDF preview', interactive=False, visible=True, height=800)
            with gr.Accordion("Examples:"):
                # Example files are read from the "examples" directory next to this file.
                # NOTE(review): os.listdir raises if that directory is missing, and
                # endswith("pdf") also matches names without a dot before "pdf".
                example_root = os.path.join(os.path.dirname(__file__), "examples")
                examples = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith("pdf")]
                gr.Examples(examples=examples, inputs=file_input)
        # Right column: conversion output and text-to-speech
        with gr.Column(variant='panel', scale=5):
            output_file = gr.File(label="Convert result", interactive=False)
            with gr.Tabs():
                with gr.Tab("Markdown rendering"):
                    md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
                with gr.Tab("Markdown text"):
                    md_text = gr.TextArea(lines=45, show_copy_button=True)
            # TTS components: a trigger button and a textbox showing the returned status
            read_button = gr.Button("Read Out Loud")
            read_status = gr.Textbox(label="TTS Status")
    
    # Define interactions
    # When the uploaded file changes, run it through file_to_pdf and show the result in the preview
    file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
    
    # Convert: to_markdown receives the file plus all options and fills both
    # Markdown views, the result file and the PDF preview
    convert_button.click(
        fn=to_markdown,
        inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
        outputs=[md_render, md_text, output_file, pdf_display]
    )
    
    # Read aloud the raw Markdown text; the status string returned by read_text goes to read_status.
    # NOTE(review): the dropdown value is forwarded as-is, so '' or 'auto' can reach the
    # TTS helpers — confirm they accept those values.
    read_button.click(fn=read_text, inputs=[md_text, language], outputs=read_status)
    
    # Components registered with the Clear button; of the option widgets only is_ocr is included
    clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])

# Launch the app only when this file is executed directly (not when imported).
# ssr_mode=True is passed straight to Gradio's launch(); see the Gradio docs for its effect.
if __name__ == "__main__":
    demo.launch(ssr_mode=True)