Echo9k committed on
Commit
a73ec05
·
1 Parent(s): 73683aa
Files changed (3) hide show
  1. app.py +37 -8
  2. header.html +124 -115
  3. tts.py +46 -0
app.py CHANGED
@@ -1,34 +1,55 @@
1
  # app.py
2
  import os
3
  import gradio as gr
 
 
4
  from gradio_pdf import PDF
 
5
  from model import model_initialized
6
  from pdf_processor import to_pdf, to_markdown
7
- from config import config
8
- import logging
9
 
10
  # Set up logging
11
  logging.basicConfig(level=logging.INFO)
12
 
13
  # Load header HTML content
14
- with open("header.html", "r") as file:
15
  header = file.read()
16
 
17
- # Language options (you may also move these to config.yaml if preferred)
18
- latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv', 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
 
 
 
19
  arabic_lang = ['ar', 'fa', 'ug', 'ur']
20
- cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava', 'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
21
- devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', 'sa', 'bgc']
 
 
22
  other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
23
 
24
  all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
25
 
26
- # Utility function to ensure input file is a PDF
27
  def file_to_pdf(file_obj):
28
  if file_obj is not None:
29
  return to_pdf(file_obj.name)
30
  return None
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  with gr.Blocks() as demo:
33
  gr.HTML(header)
34
  with gr.Row():
@@ -65,13 +86,21 @@ with gr.Blocks() as demo:
65
  md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
66
  with gr.Tab("Markdown text"):
67
  md_text = gr.TextArea(lines=45, show_copy_button=True)
 
 
 
68
 
 
69
  file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
 
70
  convert_button.click(
71
  fn=to_markdown,
72
  inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
73
  outputs=[md_render, md_text, output_file, pdf_display]
74
  )
 
 
 
75
  clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
76
 
77
  if __name__ == "__main__":
 
1
  # app.py
2
  import os
3
  import gradio as gr
4
+ import logging
5
+ import tempfile
6
  from gradio_pdf import PDF
7
+ from config import config
8
  from model import model_initialized
9
  from pdf_processor import to_pdf, to_markdown
10
+ from tts import text_to_speech_openai, text_to_speech_gtts
 
11
 
12
  # Set up logging
13
  logging.basicConfig(level=logging.INFO)
14
 
15
  # Load header HTML content
16
+ with open("header.html", "r", encoding="utf-8") as file:
17
  header = file.read()
18
 
19
+ # Define language options (could also be moved to config.yaml)
20
+ latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
21
+ 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
22
+ 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
23
+ 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
24
  arabic_lang = ['ar', 'fa', 'ug', 'ur']
25
+ cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
26
+ 'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
27
+ devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
28
+ 'sa', 'bgc']
29
  other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
30
 
31
  all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
32
 
33
+ # Define a function to convert a file to a PDF (if not already)
34
def file_to_pdf(file_obj):
    """
    Pass an uploaded file's path to ``to_pdf`` and return the result.

    Args:
        file_obj: Uploaded file object exposing a ``name`` attribute
            (the path handed to ``to_pdf``), or ``None`` when no file
            is selected.

    Returns:
        Whatever ``to_pdf`` returns for ``file_obj.name``, or ``None``
        when ``file_obj`` is ``None``.
    """
    # Guard clause: nothing uploaded means nothing to convert.
    if file_obj is None:
        return None
    return to_pdf(file_obj.name)
38
 
39
+ # Define a function to handle TTS using OpenAI (with fallback)
40
def read_text(text, language="en"):
    """
    Read ``text`` aloud and report the outcome as a status string.

    ``text_to_speech_openai`` is tried first; if it raises, the error is
    logged and ``text_to_speech_gtts`` is used instead.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected up front because there is nothing to speak.
        language: Language code forwarded to both backends. An empty value
            or ``"auto"`` is replaced with ``"en"``.

    Returns:
        A human-readable status message: success, "nothing to read", or a
        failure description when both backends raise.
    """
    if not text or not text.strip():
        return "No text to read"

    # The click handler wires the `language` input into this function, and
    # the module's language list starts with '' and 'auto'. Neither is a
    # language code, so fall back to English for those.
    # NOTE(review): other selector values (e.g. 'ch', 'korean', 'japan')
    # look OCR-specific and may not be valid TTS codes -- confirm.
    if not language or language == "auto":
        language = "en"

    try:
        text_to_speech_openai(text, language)
    except Exception as e:
        logging.error("OpenAI TTS failed: %s. Falling back to gTTS.", e)
        try:
            text_to_speech_gtts(text, language)
        except Exception as fallback_error:
            # Report the failure in the status output instead of letting
            # the exception escape the UI callback.
            logging.exception("gTTS fallback failed")
            return f"TTS failed: {fallback_error}"
    return "Audio played successfully"
51
+
52
+ # Set up the Gradio Blocks interface
53
  with gr.Blocks() as demo:
54
  gr.HTML(header)
55
  with gr.Row():
 
86
  md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
87
  with gr.Tab("Markdown text"):
88
  md_text = gr.TextArea(lines=45, show_copy_button=True)
89
+ # TTS components
90
+ read_button = gr.Button("Read Out Loud")
91
+ read_status = gr.Textbox(label="TTS Status")
92
 
93
+ # Define interactions
94
  file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
95
+
96
  convert_button.click(
97
  fn=to_markdown,
98
  inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
99
  outputs=[md_render, md_text, output_file, pdf_display]
100
  )
101
+
102
+ read_button.click(fn=read_text, inputs=[md_text, language], outputs=read_status)
103
+
104
  clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
105
 
106
  if __name__ == "__main__":
header.html CHANGED
@@ -1,132 +1,141 @@
1
  <html>
2
  <head>
3
- <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
4
- <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
5
- <style>
6
- .link-block {
7
- border: 1px solid transparent;
8
- border-radius: 24px;
9
- background-color: rgba(54, 54, 54, 1);
10
- cursor: pointer !important;
11
- }
12
- .link-block:hover {
13
- background-color: rgba(54, 54, 54, 0.75) !important;
14
- cursor: pointer !important;
15
- }
16
- .external-link {
17
- display: inline-flex;
18
- align-items: center;
19
- height: 36px;
20
- line-height: 36px;
21
- padding: 0 16px;
22
- cursor: pointer !important;
23
- }
24
- .external-link,
25
- .external-link:hover {
26
- cursor: pointer !important;
27
- }
28
- a {
29
- text-decoration: none;
30
- }
31
- </style>
32
  </head>
33
 
34
- <body>
35
- <div style="
36
- display: flex;
37
- flex-direction: column;
38
- justify-content: center;
39
- align-items: center;
40
- text-align: center;
41
- background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
42
- padding: 24px;
43
- gap: 24px;
44
- border-radius: 8px;
45
- ">
46
  <div style="
47
  display: flex;
48
  flex-direction: column;
 
49
  align-items: center;
50
- gap: 16px;
 
 
 
 
51
  ">
52
- <div style="display: flex; flex-direction: column; gap: 8px">
53
- <h1 style="
54
- font-size: 48px;
55
- color: #fafafa;
56
- margin: 0;
57
- font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
58
- 'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
59
- ">
60
- MinerU: PDF Extraction Demo
61
- </h1>
 
 
 
 
 
 
 
62
  </div>
63
- </div>
64
 
65
- <p style="
66
- margin: 0;
67
- line-height: 1.6rem;
68
- font-size: 16px;
69
- color: #fafafa;
70
- opacity: 0.8;
71
- ">
72
- A one-stop, open-source, high-quality data extraction tool, supports
73
- PDF/webpage/e-book extraction.<br>
74
- </p>
75
- <style>
76
- .link-block {
77
- display: inline-block;
78
- }
79
- .link-block + .link-block {
80
- margin-left: 20px;
81
- }
82
- </style>
83
 
84
- <div class="column has-text-centered">
85
- <div class="publication-links">
86
- <!-- Code Link. -->
87
- <span class="link-block">
88
- <a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
89
- <span class="icon" style="margin-right: 4px">
90
- <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
91
- </span>
92
- <span style="color: white">Code</span>
93
- </a>
94
- </span>
95
 
96
- <!-- arXiv Link. -->
97
- <span class="link-block">
98
- <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
99
- <span class="icon" style="margin-right: 8px">
100
- <i class="fas fa-file" style="color: white"></i>
101
- </span>
102
- <span style="color: white">Paper</span>
103
- </a>
104
- </span>
105
 
106
- <!-- Homepage Link. -->
107
- <span class="link-block">
108
- <a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
109
- <span class="icon" style="margin-right: 8px">
110
- <i class="fas fa-home" style="color: white"></i>
111
- </span>
112
- <span style="color: white">Homepage</span>
113
- </a>
114
- </span>
115
 
116
- <!-- Client Link. -->
117
- <span class="link-block">
118
- <a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
119
- <span class="icon" style="margin-right: 8px">
120
- <i class="fas fa-download" style="color: white"></i>
121
- </span>
122
- <span style="color: white">Download</span>
123
- </a>
124
- </span>
125
- </div>
126
- </div>
127
-
128
- <!-- New Demo Links -->
129
- </div>
130
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- </body></html>
 
 
 
 
1
  <html>
2
  <head>
3
+ <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
4
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
5
+ <style>
6
+ .link-block {
7
+ border: 1px solid transparent;
8
+ border-radius: 24px;
9
+ background-color: rgba(54, 54, 54, 1);
10
+ cursor: pointer !important;
11
+ }
12
+ .link-block:hover {
13
+ background-color: rgba(54, 54, 54, 0.75) !important;
14
+ cursor: pointer !important;
15
+ }
16
+ .external-link {
17
+ display: inline-flex;
18
+ align-items: center;
19
+ height: 36px;
20
+ line-height: 36px;
21
+ padding: 0 16px;
22
+ cursor: pointer !important;
23
+ }
24
+ .external-link,
25
+ .external-link:hover {
26
+ cursor: pointer !important;
27
+ }
28
+ a {
29
+ text-decoration: none;
30
+ }
31
+ </style>
32
  </head>
33
 
34
+ <body>
 
 
 
 
 
 
 
 
 
 
 
35
  <div style="
36
  display: flex;
37
  flex-direction: column;
38
+ justify-content: center;
39
  align-items: center;
40
+ text-align: center;
41
+ background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
42
+ padding: 24px;
43
+ gap: 24px;
44
+ border-radius: 8px;
45
  ">
46
+ <div style="
47
+ display: flex;
48
+ flex-direction: column;
49
+ align-items: center;
50
+ gap: 16px;
51
+ ">
52
+ <div style="display: flex; flex-direction: column; gap: 8px">
53
+ <h1 style="
54
+ font-size: 48px;
55
+ color: #fafafa;
56
+ margin: 0;
57
+ font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
58
+ 'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
59
+ ">
60
+ MinerU: PDF Extraction &amp; Voice Reading Demo
61
+ </h1>
62
+ </div>
63
  </div>
 
64
 
65
+ <p style="
66
+ margin: 0;
67
+ line-height: 1.6rem;
68
+ font-size: 16px;
69
+ color: #fafafa;
70
+ opacity: 0.8;
71
+ ">
72
+ A one-stop, open-source, high-quality tool for data extraction and PDF voice reading,<br>
73
+ supporting PDF, webpage, and e-book extraction.
74
+ </p>
75
+ <style>
76
+ .link-block {
77
+ display: inline-block;
78
+ }
79
+ .link-block + .link-block {
80
+ margin-left: 20px;
81
+ }
82
+ </style>
83
 
84
+ <div class="column has-text-centered">
85
+ <div class="publication-links">
86
+ <!-- Code Link. -->
87
+ <span class="link-block">
88
+ <a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
89
+ <span class="icon" style="margin-right: 4px">
90
+ <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
91
+ </span>
92
+ <span style="color: white">Code</span>
93
+ </a>
94
+ </span>
95
 
96
+ <!-- arXiv Link. -->
97
+ <span class="link-block">
98
+ <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
99
+ <span class="icon" style="margin-right: 8px">
100
+ <i class="fas fa-file" style="color: white"></i>
101
+ </span>
102
+ <span style="color: white">Paper</span>
103
+ </a>
104
+ </span>
105
 
106
+ <!-- Homepage Link. -->
107
+ <span class="link-block">
108
+ <a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
109
+ <span class="icon" style="margin-right: 8px">
110
+ <i class="fas fa-home" style="color: white"></i>
111
+ </span>
112
+ <span style="color: white">Homepage</span>
113
+ </a>
114
+ </span>
115
 
116
+ <!-- Client Link. -->
117
+ <span class="link-block">
118
+ <a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
119
+ <span class="icon" style="margin-right: 8px">
120
+ <i class="fas fa-download" style="color: white"></i>
121
+ </span>
122
+ <span style="color: white">Download</span>
123
+ </a>
124
+ </span>
 
 
 
 
 
125
 
126
+ <!-- Voice Reading Demo Link. -->
127
+ <span class="link-block">
128
+ <a href="https://mineru.org.cn/voice?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
129
+ <span class="icon" style="margin-right: 8px">
130
+ <i class="fas fa-volume-up" style="color: white"></i>
131
+ </span>
132
+ <span style="color: white">Voice Reading Demo</span>
133
+ </a>
134
+ </span>
135
+ </div>
136
+ </div>
137
 
138
+ <!-- New Demo Links -->
139
+ </div>
140
+ </body>
141
+ </html>
tts.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tts.py
2
+ import os
3
+ import tempfile
4
+ import requests
5
+ from playsound import playsound
6
+
7
def text_to_speech_openai(text, language="en"):
    """
    Convert text to speech with the OpenAI speech endpoint and play it.

    The audio bytes are written to a temporary ``.mp3`` file, played with
    ``playsound``, and the file is removed afterwards.

    Args:
        text: Text to synthesize.
        language: Kept for signature compatibility with
            ``text_to_speech_gtts``; the speech endpoint takes no language
            parameter, so it is not sent.

    Raises:
        ValueError: If the ``api_key_oai`` environment variable is unset
            or empty.
        RuntimeError: If the synthesis request fails for any reason
            (including an installed ``openai`` SDK that lacks the
            ``OpenAI`` client class).
    """
    # Validate configuration first so a missing key fails fast, before the
    # SDK is imported.
    api_key = os.getenv("api_key_oai")
    if not api_key:
        raise ValueError("API key for OpenAI TTS not found in environment variable 'api_key_oai'")

    import openai

    try:
        # A dedicated client avoids mutating module-level SDK state.
        client = openai.OpenAI(api_key=api_key)
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text,
            response_format="mp3",
        )
        audio_data = response.content
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"OpenAI TTS synthesis failed: {e}") from e

    # delete=False because the player opens the file by path after this
    # handle is closed; cleanup is done explicitly below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_file.write(audio_data)
        tmp_file_path = tmp_file.name
    try:
        playsound(tmp_file_path)
    finally:
        try:
            os.remove(tmp_file_path)
        except OSError:
            # Best-effort cleanup: a leftover temp file must not mask a
            # playback error or turn a successful playback into a failure.
            pass
36
+
37
def text_to_speech_gtts(text, language="en"):
    """
    Fallback text-to-speech using the gTTS library.

    Synthesizes ``text`` to a temporary ``.mp3`` file, plays it with
    ``playsound``, and removes the file afterwards.

    Args:
        text: Text to synthesize.
        language: Language code passed to ``gTTS`` as ``lang``.
    """
    from gtts import gTTS
    tts = gTTS(text=text, lang=language)

    # Reserve a unique path and close the handle before gTTS writes to it:
    # saving to the path while the handle is still open fails on platforms
    # that lock open files (e.g. Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_file_path = tmp_file.name
    try:
        tts.save(tmp_file_path)
        playsound(tmp_file_path)
    finally:
        try:
            os.remove(tmp_file_path)
        except OSError:
            # Best-effort cleanup: a leftover temp file must not mask a
            # synthesis or playback error.
            pass