Spaces:
Sleeping
Sleeping
Added TTS
Browse files- app.py +37 -8
- header.html +124 -115
- tts.py +46 -0
app.py
CHANGED
@@ -1,34 +1,55 @@
|
|
1 |
# app.py
|
2 |
import os
|
3 |
import gradio as gr
|
|
|
|
|
4 |
from gradio_pdf import PDF
|
|
|
5 |
from model import model_initialized
|
6 |
from pdf_processor import to_pdf, to_markdown
|
7 |
-
from
|
8 |
-
import logging
|
9 |
|
10 |
# Set up logging
|
11 |
logging.basicConfig(level=logging.INFO)
|
12 |
|
13 |
# Load header HTML content
|
14 |
-
with open("header.html", "r") as file:
|
15 |
header = file.read()
|
16 |
|
17 |
-
#
|
18 |
-
latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
|
|
|
|
|
|
|
19 |
arabic_lang = ['ar', 'fa', 'ug', 'ur']
|
20 |
-
cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
|
21 |
-
|
|
|
|
|
22 |
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
23 |
|
24 |
all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
|
25 |
|
26 |
-
#
|
27 |
def file_to_pdf(file_obj):
|
28 |
if file_obj is not None:
|
29 |
return to_pdf(file_obj.name)
|
30 |
return None
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
with gr.Blocks() as demo:
|
33 |
gr.HTML(header)
|
34 |
with gr.Row():
|
@@ -65,13 +86,21 @@ with gr.Blocks() as demo:
|
|
65 |
md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
|
66 |
with gr.Tab("Markdown text"):
|
67 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
|
|
|
|
|
|
68 |
|
|
|
69 |
file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
|
|
|
70 |
convert_button.click(
|
71 |
fn=to_markdown,
|
72 |
inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
|
73 |
outputs=[md_render, md_text, output_file, pdf_display]
|
74 |
)
|
|
|
|
|
|
|
75 |
clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
|
76 |
|
77 |
if __name__ == "__main__":
|
|
|
1 |
# app.py
|
2 |
import os
|
3 |
import gradio as gr
|
4 |
+
import logging
|
5 |
+
import tempfile
|
6 |
from gradio_pdf import PDF
|
7 |
+
from config import config
|
8 |
from model import model_initialized
|
9 |
from pdf_processor import to_pdf, to_markdown
|
10 |
+
from tts import text_to_speech_openai, text_to_speech_gtts
|
|
|
11 |
|
12 |
# Set up logging
|
13 |
logging.basicConfig(level=logging.INFO)
|
14 |
|
15 |
# Load header HTML content
|
16 |
+
with open("header.html", "r", encoding="utf-8") as file:
|
17 |
header = file.read()
|
18 |
|
19 |
+
# Define language options (could also be moved to config.yaml)
|
20 |
+
latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
|
21 |
+
'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
|
22 |
+
'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
|
23 |
+
'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
|
24 |
arabic_lang = ['ar', 'fa', 'ug', 'ur']
|
25 |
+
cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
|
26 |
+
'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
|
27 |
+
devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
|
28 |
+
'sa', 'bgc']
|
29 |
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
30 |
|
31 |
all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
|
32 |
|
33 |
+
# Define a function to convert a file to a PDF (if not already)
|
34 |
def file_to_pdf(file_obj):
    """Convert the uploaded file to a PDF by delegating to ``to_pdf``.

    Args:
        file_obj: The value delivered by the upload component. Either a
            filesystem path (``str`` / ``os.PathLike``) or an object that
            exposes the path as ``.name``. ``None`` means no file is
            selected (for example after the input was cleared).

    Returns:
        The result of ``to_pdf`` for the file's path, or ``None`` when no
        file was supplied.
    """
    if file_obj is None:
        return None
    # NOTE(review): depending on how the upload component is configured,
    # Gradio may pass a plain path string instead of a tempfile-like wrapper;
    # a bare ``file_obj.name`` would raise AttributeError for a string.
    if isinstance(file_obj, (str, os.PathLike)):
        path = file_obj
    else:
        path = file_obj.name
    return to_pdf(path)
|
38 |
|
39 |
+
# Define a function to handle TTS using OpenAI (with fallback)
|
40 |
+
def read_text(text, language="en"):
    """Read *text* aloud, preferring OpenAI TTS and falling back to gTTS.

    Args:
        text: The text to synthesize. ``None``, empty, or whitespace-only
            text is not sent to either backend.
        language: Language code forwarded to the TTS backends. An empty
            value or ``"auto"`` is replaced by ``"en"``.

    Returns:
        A short status string for display in the UI.

    Raises:
        Exception: Whatever ``text_to_speech_gtts`` raises if the fallback
            fails as well; only the OpenAI failure is swallowed (and logged).
    """
    # Nothing to synthesize: skip both backends instead of letting them fail
    # on empty input.
    if text is None or not text.strip():
        return "No text to read"

    # The click handler passes the value of the OCR language selector, whose
    # option list starts with '' and 'auto'. Those are not speech language
    # codes, so fall back to English for them.
    # NOTE(review): other OCR codes (e.g. 'ch', 'japan', 'korean') are passed
    # through unchanged -- confirm they are valid for the TTS backends.
    lang = language if language and language != "auto" else "en"

    try:
        text_to_speech_openai(text, lang)
    except Exception as e:
        logging.error("OpenAI TTS failed: %s. Falling back to gTTS.", e)
        text_to_speech_gtts(text, lang)
    return "Audio played successfully"
|
51 |
+
|
52 |
+
# Set up the Gradio Blocks interface
|
53 |
with gr.Blocks() as demo:
|
54 |
gr.HTML(header)
|
55 |
with gr.Row():
|
|
|
86 |
md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
|
87 |
with gr.Tab("Markdown text"):
|
88 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
89 |
+
# TTS components
|
90 |
+
read_button = gr.Button("Read Out Loud")
|
91 |
+
read_status = gr.Textbox(label="TTS Status")
|
92 |
|
93 |
+
# Define interactions
|
94 |
file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
|
95 |
+
|
96 |
convert_button.click(
|
97 |
fn=to_markdown,
|
98 |
inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
|
99 |
outputs=[md_render, md_text, output_file, pdf_display]
|
100 |
)
|
101 |
+
|
102 |
+
read_button.click(fn=read_text, inputs=[md_text, language], outputs=read_status)
|
103 |
+
|
104 |
clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
|
105 |
|
106 |
if __name__ == "__main__":
|
header.html
CHANGED
@@ -1,132 +1,141 @@
|
|
1 |
<html>
|
2 |
<head>
|
3 |
-
|
4 |
-
|
5 |
-
<style>
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
</style>
|
32 |
</head>
|
33 |
|
34 |
-
<body>
|
35 |
-
<div style="
|
36 |
-
display: flex;
|
37 |
-
flex-direction: column;
|
38 |
-
justify-content: center;
|
39 |
-
align-items: center;
|
40 |
-
text-align: center;
|
41 |
-
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
|
42 |
-
padding: 24px;
|
43 |
-
gap: 24px;
|
44 |
-
border-radius: 8px;
|
45 |
-
">
|
46 |
<div style="
|
47 |
display: flex;
|
48 |
flex-direction: column;
|
|
|
49 |
align-items: center;
|
50 |
-
|
|
|
|
|
|
|
|
|
51 |
">
|
52 |
-
<div style="
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
"
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
</div>
|
63 |
-
</div>
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
</div>
|
126 |
-
</div>
|
127 |
-
|
128 |
-
<!-- New Demo Links -->
|
129 |
-
</div>
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
-
|
|
|
|
|
|
|
|
1 |
<html>
|
2 |
<head>
|
3 |
+
<!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
|
4 |
+
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
|
5 |
+
<style>
|
6 |
+
.link-block {
|
7 |
+
border: 1px solid transparent;
|
8 |
+
border-radius: 24px;
|
9 |
+
background-color: rgba(54, 54, 54, 1);
|
10 |
+
cursor: pointer !important;
|
11 |
+
}
|
12 |
+
.link-block:hover {
|
13 |
+
background-color: rgba(54, 54, 54, 0.75) !important;
|
14 |
+
cursor: pointer !important;
|
15 |
+
}
|
16 |
+
.external-link {
|
17 |
+
display: inline-flex;
|
18 |
+
align-items: center;
|
19 |
+
height: 36px;
|
20 |
+
line-height: 36px;
|
21 |
+
padding: 0 16px;
|
22 |
+
cursor: pointer !important;
|
23 |
+
}
|
24 |
+
.external-link,
|
25 |
+
.external-link:hover {
|
26 |
+
cursor: pointer !important;
|
27 |
+
}
|
28 |
+
a {
|
29 |
+
text-decoration: none;
|
30 |
+
}
|
31 |
+
</style>
|
32 |
</head>
|
33 |
|
34 |
+
<body>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
<div style="
|
36 |
display: flex;
|
37 |
flex-direction: column;
|
38 |
+
justify-content: center;
|
39 |
align-items: center;
|
40 |
+
text-align: center;
|
41 |
+
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
|
42 |
+
padding: 24px;
|
43 |
+
gap: 24px;
|
44 |
+
border-radius: 8px;
|
45 |
">
|
46 |
+
<div style="
|
47 |
+
display: flex;
|
48 |
+
flex-direction: column;
|
49 |
+
align-items: center;
|
50 |
+
gap: 16px;
|
51 |
+
">
|
52 |
+
<div style="display: flex; flex-direction: column; gap: 8px">
|
53 |
+
<h1 style="
|
54 |
+
font-size: 48px;
|
55 |
+
color: #fafafa;
|
56 |
+
margin: 0;
|
57 |
+
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
|
58 |
+
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
|
59 |
+
">
|
60 |
+
MinerU: PDF Extraction & Voice Reading Demo
|
61 |
+
</h1>
|
62 |
+
</div>
|
63 |
</div>
|
|
|
64 |
|
65 |
+
<p style="
|
66 |
+
margin: 0;
|
67 |
+
line-height: 1.6rem;
|
68 |
+
font-size: 16px;
|
69 |
+
color: #fafafa;
|
70 |
+
opacity: 0.8;
|
71 |
+
">
|
72 |
+
A one-stop, open-source, high-quality tool for data extraction and PDF voice reading,<br>
|
73 |
+
supporting PDF, webpage, and e-book extraction.
|
74 |
+
</p>
|
75 |
+
<style>
|
76 |
+
.link-block {
|
77 |
+
display: inline-block;
|
78 |
+
}
|
79 |
+
.link-block + .link-block {
|
80 |
+
margin-left: 20px;
|
81 |
+
}
|
82 |
+
</style>
|
83 |
|
84 |
+
<div class="column has-text-centered">
|
85 |
+
<div class="publication-links">
|
86 |
+
<!-- Code Link. -->
|
87 |
+
<span class="link-block">
|
88 |
+
<a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
89 |
+
<span class="icon" style="margin-right: 4px">
|
90 |
+
<i class="fab fa-github" style="color: white; margin-right: 4px"></i>
|
91 |
+
</span>
|
92 |
+
<span style="color: white">Code</span>
|
93 |
+
</a>
|
94 |
+
</span>
|
95 |
|
96 |
+
<!-- arXiv Link. -->
|
97 |
+
<span class="link-block">
|
98 |
+
<a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
99 |
+
<span class="icon" style="margin-right: 8px">
|
100 |
+
<i class="fas fa-file" style="color: white"></i>
|
101 |
+
</span>
|
102 |
+
<span style="color: white">Paper</span>
|
103 |
+
</a>
|
104 |
+
</span>
|
105 |
|
106 |
+
<!-- Homepage Link. -->
|
107 |
+
<span class="link-block">
|
108 |
+
<a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
109 |
+
<span class="icon" style="margin-right: 8px">
|
110 |
+
<i class="fas fa-home" style="color: white"></i>
|
111 |
+
</span>
|
112 |
+
<span style="color: white">Homepage</span>
|
113 |
+
</a>
|
114 |
+
</span>
|
115 |
|
116 |
+
<!-- Client Link. -->
|
117 |
+
<span class="link-block">
|
118 |
+
<a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
119 |
+
<span class="icon" style="margin-right: 8px">
|
120 |
+
<i class="fas fa-download" style="color: white"></i>
|
121 |
+
</span>
|
122 |
+
<span style="color: white">Download</span>
|
123 |
+
</a>
|
124 |
+
</span>
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
+
<!-- Voice Reading Demo Link. -->
|
127 |
+
<span class="link-block">
|
128 |
+
<a href="https://mineru.org.cn/voice?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
129 |
+
<span class="icon" style="margin-right: 8px">
|
130 |
+
<i class="fas fa-volume-up" style="color: white"></i>
|
131 |
+
</span>
|
132 |
+
<span style="color: white">Voice Reading Demo</span>
|
133 |
+
</a>
|
134 |
+
</span>
|
135 |
+
</div>
|
136 |
+
</div>
|
137 |
|
138 |
+
<!-- New Demo Links -->
|
139 |
+
</div>
|
140 |
+
</body>
|
141 |
+
</html>
|
tts.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# tts.py
|
2 |
+
import os
|
3 |
+
import tempfile
|
4 |
+
import requests
|
5 |
+
from playsound import playsound
|
6 |
+
|
7 |
+
def text_to_speech_openai(text, language="en"):
    """Synthesize *text* with the OpenAI text-to-speech API and play it.

    The audio is written to a temporary ``.mp3`` file, played with
    ``playsound`` on the machine running this process, and the file is
    removed afterwards.

    Args:
        text: The text to synthesize.
        language: Accepted for signature parity with
            ``text_to_speech_gtts``. It is not sent to the API: the speech
            endpoint takes no language parameter.

    Raises:
        ValueError: If the ``api_key_oai`` environment variable is unset
            or empty.
        RuntimeError: If the synthesis request fails for any reason; the
            original exception is attached as ``__cause__``.
    """
    # Check the key before importing openai so a missing key is reported
    # as such even when the package itself is unavailable.
    api_key = os.getenv("api_key_oai")
    if not api_key:
        raise ValueError("API key for OpenAI TTS not found in environment variable 'api_key_oai'")

    import openai

    # Kept so code relying on the module-level key keeps working.
    openai.api_key = api_key

    try:
        # The previous ``openai.Audio.synthesize`` call was a placeholder
        # that does not exist in the library; this is the speech endpoint
        # of the openai>=1.0 client. It returns the audio bytes directly
        # (MP3 by default), so no separate download step is needed.
        client = openai.OpenAI(api_key=api_key)
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text,
        )
        audio_data = response.content
    except Exception as e:
        raise RuntimeError(f"OpenAI TTS synthesis failed: {e}") from e

    # delete=False because the file must be closed before the player opens
    # it by name; it is removed explicitly once playback has returned.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    try:
        with tmp_file:
            tmp_file.write(audio_data)
        playsound(tmp_file.name)
    finally:
        os.remove(tmp_file.name)
|
36 |
+
|
37 |
+
def text_to_speech_gtts(text, language="en"):
    """Fallback text-to-speech using the gTTS library.

    The synthesized audio is saved to a temporary ``.mp3`` file, played
    with ``playsound`` on the machine running this process, and the file
    is removed afterwards.

    Args:
        text: The text to synthesize.
        language: Language code passed to ``gTTS`` as ``lang``.
    """
    from gtts import gTTS

    tts = gTTS(text=text, lang=language)

    # Only a unique path is needed here. Close the handle right away so that
    # gTTS (and later the player) can open the file by name without a second
    # handle being held on it.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp_file.close()
    try:
        tts.save(tmp_file.name)
        playsound(tmp_file.name)
    finally:
        # delete=False means nothing else cleans this up; without the
        # removal every call leaves an .mp3 behind in the temp directory.
        os.remove(tmp_file.name)
|