File size: 5,264 Bytes
73683aa
0cc1374
 
73683aa
6b438f3
73683aa
 
6b438f3
 
b07522c
6b438f3
 
 
 
 
 
 
 
b07522c
73683aa
6b438f3
 
 
 
 
 
 
2dbedf0
6b438f3
 
a011e6d
6b438f3
 
a011e6d
73683aa
 
 
 
6b438f3
 
 
 
 
 
73683aa
 
6b438f3
a73ec05
6b438f3
a73ec05
6b438f3
 
 
 
 
 
 
 
 
 
a73ec05
73683aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b438f3
 
 
73683aa
 
a73ec05
73683aa
 
 
 
 
a73ec05
6b438f3
 
 
 
 
 
a73ec05
73683aa
0a681f9
0cc1374
a232e1e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# app.py
import os
import gradio as gr
from gradio_pdf import PDF
import logging
from model import model_initialized
from pdf_processor import to_pdf, to_markdown
from config import config
from tts import text_to_speech  # Import TTS module

# Configure the root logger: INFO and above, rendered as "LEVEL: message".
# The messages themselves carry ANSI escape codes for coloured output.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)

def log_info(message: str):
    """Log *message* at INFO level, wrapped in ANSI green so it stands out."""
    green, reset = "\033[92m", "\033[0m"
    logging.info("{}{}{}".format(green, message, reset))

def log_error(message: str):
    """Log *message* at ERROR level, wrapped in ANSI red so it stands out."""
    red, reset = "\033[91m", "\033[0m"
    logging.error("{}{}{}".format(red, message, reset))

# Load the HTML banner rendered at the top of the page. A missing or unreadable
# file must not stop the app from starting, so fall back to a placeholder.
try:
    # Decode explicitly as UTF-8 so the result does not depend on the host locale.
    with open("header.html", "r", encoding="utf-8") as file:
        header = file.read()
except (OSError, UnicodeDecodeError) as e:
    log_error(f"Failed to load header.html. Error: {e}")
    header = "<h1>Header not found</h1>"
else:
    log_info("Header loaded successfully.")

# Language codes offered in the "Language" dropdown, grouped by script family.
latin_lang = [
    "af", "az", "bs", "cs", "cy", "da", "de", "es", "et", "fr",
    "ga", "hr", "hu", "id", "is", "it", "ku", "la", "lt", "lv",
    "mi", "ms", "mt", "nl", "no", "oc", "pi", "pl", "pt", "ro",
    "rs_latin", "sk", "sl", "sq", "sv", "sw", "tl", "tr", "uz", "vi",
    "french", "german",
]
arabic_lang = ["ar", "fa", "ug", "ur"]
cyrillic_lang = [
    "ru", "rs_cyrillic", "be", "bg", "uk", "mn", "abq", "ady",
    "kbd", "ava", "dar", "inh", "che", "lbe", "lez", "tab",
]
devanagari_lang = [
    "hi", "mr", "ne", "bh", "mai", "ang", "bho",
    "mah", "sck", "new", "gom", "sa", "bgc",
]
other_lang = ["ch", "en", "korean", "japan", "chinese_cht", "ta", "te", "ka"]

# Full dropdown list: the empty and "auto" entries come first, then each group.
all_lang = [
    "",
    "auto",
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]

def file_to_pdf(file_obj):
    """Convert an uploaded file to PDF via ``to_pdf`` and return its result.

    ``file_obj`` is the value delivered by the file input; its ``name``
    attribute is passed to ``to_pdf``. Returns ``None`` when no file is given
    or when the conversion raises (the error is logged, not propagated).
    """
    if file_obj is None:
        return None

    try:
        converted = to_pdf(file_obj.name)
        log_info("File converted to PDF successfully.")
    except Exception as exc:
        log_error(f"Error converting file to PDF: {exc}")
        return None
    return converted

def generate_audio(text: str) -> str:
    """
    Run text-to-speech on ``text`` and return what ``text_to_speech`` yields
    (the audio file path).

    An empty string is returned, and an error logged, when ``text`` is empty
    or when speech synthesis raises.
    """
    if not text:
        log_error("No text provided for TTS.")
        return ""

    try:
        audio_path = text_to_speech(text)
        log_info("Audio generated successfully.")
    except Exception as exc:
        log_error(f"Audio generation failed: {exc}")
        return ""
    return audio_path

def _list_example_pdfs(directory):
    """Return the sorted paths of the ``.pdf`` files directly inside *directory*.

    A missing or unreadable directory is logged and yields an empty list, so
    the UI can still be built without an examples gallery.
    """
    try:
        names = os.listdir(directory)
    except OSError as e:
        log_error(f"Failed to list examples in {directory}. Error: {e}")
        return []
    # os.listdir returns entries in arbitrary order; sort for a stable gallery.
    return [
        os.path.join(directory, name)
        for name in sorted(names)
        if name.lower().endswith(".pdf")
    ]


# Build the two-column UI: inputs and PDF preview on the left, conversion
# results and text-to-speech playback on the right.
with gr.Blocks() as demo:
    gr.HTML(header)
    with gr.Row():
        # Left column: upload, conversion options, preview and examples.
        with gr.Column(variant='panel', scale=5):
            file_input = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
            max_pages = gr.Slider(1, 20, config.get("max_pages_default", 10), step=1, label='Max convert pages')
            with gr.Row():
                layout_mode = gr.Dropdown(
                    ["layoutlmv3", "doclayout_yolo"],
                    label="Layout model",
                    value=config.get("layout_model_default", "doclayout_yolo")
                )
                language = gr.Dropdown(
                    all_lang,
                    label="Language",
                    value=config.get("language_default", "auto")
                )
            with gr.Row():
                formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
                is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
                table_enable = gr.Checkbox(label="Enable table recognition", value=True)
            with gr.Row():
                convert_button = gr.Button("Convert")
                clear_button = gr.ClearButton(value="Clear")
            pdf_display = PDF(label='PDF preview', interactive=False, visible=True, height=800)
            # Example PDFs live in an "examples" directory next to this file.
            example_root = os.path.join(os.path.dirname(__file__), "examples")
            examples = _list_example_pdfs(example_root)
            # Only render the gallery when there is something to show.
            if examples:
                with gr.Accordion("Examples:"):
                    gr.Examples(examples=examples, inputs=file_input)
        # Right column: downloadable result, Markdown views and audio playback.
        with gr.Column(variant='panel', scale=5):
            output_file = gr.File(label="Convert result", interactive=False)
            with gr.Tabs():
                with gr.Tab("Markdown rendering"):
                    md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
                with gr.Tab("Markdown text"):
                    md_text = gr.TextArea(lines=45, show_copy_button=True)
            # Audio component for TTS playback
            audio_output = gr.Audio(label="Read Aloud", type="filepath")
            read_button = gr.Button("Read Aloud")

    # Refresh the PDF preview whenever the uploaded file changes.
    file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)

    # "Convert" runs to_markdown on the current options and fills the result
    # widgets (rendered Markdown, raw Markdown, result file, PDF preview).
    convert_button.click(
        fn=to_markdown,
        inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
        outputs=[md_render, md_text, output_file, pdf_display]
    )

    # When "Read Aloud" is clicked, generate audio from the markdown text
    read_button.click(
        fn=generate_audio,
        inputs=md_text,
        outputs=audio_output
    )

    # Components reset by the "Clear" button.
    clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    launch_options = {"ssr_mode": True}
    demo.launch(**launch_options)