PDF_reader

Sleeping

App Files Files Community

Echo9k committed on Feb 27

Commit

a73ec05

1 Parent(s): 73683aa

Added TTS

Browse files

Files changed (3) hide show

app.py +37 -8
header.html +124 -115
tts.py +46 -0

app.py CHANGED Viewed

@@ -1,34 +1,55 @@
 # app.py
 import os
 import gradio as gr
 from gradio_pdf import PDF
 from model import model_initialized
 from pdf_processor import to_pdf, to_markdown
-from config import config
-import logging
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 # Load header HTML content
-with open("header.html", "r") as file:
     header = file.read()
-# Language options (you may also move these to config.yaml if preferred)
-latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv', 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
 arabic_lang = ['ar', 'fa', 'ug', 'ur']
-cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava', 'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
-devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', 'sa', 'bgc']
 other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
 all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
-# Utility function to ensure input file is a PDF
 def file_to_pdf(file_obj):
     if file_obj is not None:
         return to_pdf(file_obj.name)
     return None
 with gr.Blocks() as demo:
     gr.HTML(header)
     with gr.Row():
@@ -65,13 +86,21 @@ with gr.Blocks() as demo:
                     md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
                 with gr.Tab("Markdown text"):
                     md_text = gr.TextArea(lines=45, show_copy_button=True)
     file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
     convert_button.click(
         fn=to_markdown,
         inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
         outputs=[md_render, md_text, output_file, pdf_display]
     )
     clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
 if __name__ == "__main__":

 # app.py
 import os
 import gradio as gr
+import logging
+import tempfile
 from gradio_pdf import PDF
+from config import config
 from model import model_initialized
 from pdf_processor import to_pdf, to_markdown
+from tts import text_to_speech_openai, text_to_speech_gtts
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 # Load header HTML content
+with open("header.html", "r", encoding="utf-8") as file:
     header = file.read()
+# Define language options (could also be moved to config.yaml)
+latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
+              'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
+              'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
+              'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
 arabic_lang = ['ar', 'fa', 'ug', 'ur']
+cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
+                 'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
+devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
+                   'sa', 'bgc']
 other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
 all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
+# Define a function to convert a file to a PDF (if not already)
 def file_to_pdf(file_obj):
+    """Return ``to_pdf(file_obj.name)`` for an uploaded file, or ``None`` when ``file_obj`` is ``None``."""
     if file_obj is not None:
         return to_pdf(file_obj.name)
     return None
+# Define a function to handle TTS using OpenAI (with fallback)
+def read_text(text, language="en"):
+    """
+    Synthesize speech for ``text`` and return a status message for the UI.
+
+    Tries ``text_to_speech_openai`` first; if it raises, the error is logged
+    and ``text_to_speech_gtts`` is used instead.
+
+    Args:
+        text: Text to read aloud.
+        language: Language code passed to the TTS backends. ``all_lang``
+            contains ``''`` and ``'auto'``, which are not language codes
+            (presumably they are choices of the language selector -- verify),
+            so both are treated as ``"en"``.
+
+    Returns:
+        A status string: ``"No text to read"`` when ``text`` is empty or
+        whitespace-only, otherwise ``"Audio played successfully"``.
+
+    Raises:
+        Exception: whatever ``text_to_speech_gtts`` raises when the fallback
+            fails as well.
+    """
+    # Nothing to synthesize: skip both backends instead of letting them fail.
+    if not text or not text.strip():
+        return "No text to read"
+    # Map the non-code selector values onto the default language.
+    if not language or language == "auto":
+        language = "en"
+    try:
+        text_to_speech_openai(text, language)
+    except Exception as e:
+        logging.error("OpenAI TTS failed: %s. Falling back to gTTS.", e)
+        text_to_speech_gtts(text, language)
+    return "Audio played successfully"
+# Set up the Gradio Blocks interface
 with gr.Blocks() as demo:
     gr.HTML(header)
     with gr.Row():
                     md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
                 with gr.Tab("Markdown text"):
                     md_text = gr.TextArea(lines=45, show_copy_button=True)
+            # TTS components
+            read_button = gr.Button("Read Out Loud")
+            read_status = gr.Textbox(label="TTS Status")
+    # Define interactions
     file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
     convert_button.click(
         fn=to_markdown,
         inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
         outputs=[md_render, md_text, output_file, pdf_display]
     )
+    read_button.click(fn=read_text, inputs=[md_text, language], outputs=read_status)
     clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
 if __name__ == "__main__":

header.html CHANGED Viewed

@@ -1,132 +1,141 @@
 <html>
   <head>
-  <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
-  <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
-<style>
-  .link-block {
-    border: 1px solid transparent;
-    border-radius: 24px;
-    background-color: rgba(54, 54, 54, 1);
-    cursor: pointer !important;
-  }
-  .link-block:hover {
-    background-color: rgba(54, 54, 54, 0.75) !important;
-    cursor: pointer !important;
-  }
-  .external-link {
-    display: inline-flex;
-    align-items: center;
-    height: 36px;
-    line-height: 36px;
-    padding: 0 16px;
-    cursor: pointer !important;
-  }
-  .external-link,
-  .external-link:hover {
-    cursor: pointer !important;
-  }
-  a {
-    text-decoration: none;
-  }
-</style>
   </head>
-<body>
-  <div style="
-      display: flex;
-      flex-direction: column;
-      justify-content: center;
-      align-items: center;
-      text-align: center;
-      background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
-      padding: 24px;
-      gap: 24px;
-      border-radius: 8px;
-    ">
     <div style="
         display: flex;
         flex-direction: column;
         align-items: center;
-        gap: 16px;
       ">
-      <div style="display: flex; flex-direction: column; gap: 8px">
-        <h1 style="
-            font-size: 48px;
-            color: #fafafa;
-            margin: 0;
-            font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
-              'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
-          ">
-          MinerU: PDF Extraction Demo
-        </h1>
       </div>
-    </div>
-    <p style="
-        margin: 0;
-        line-height: 1.6rem;
-        font-size: 16px;
-        color: #fafafa;
-        opacity: 0.8;
-      ">
-      A one-stop, open-source, high-quality data extraction tool, supports
-      PDF/webpage/e-book extraction.<br>
-    </p>
-    <style>
-      .link-block {
-        display: inline-block;
-      }
-      .link-block + .link-block {
-        margin-left: 20px;
-      }
-    </style>
-    <div class="column has-text-centered">
-      <div class="publication-links">
-        <!-- Code Link. -->
-        <span class="link-block">
-          <a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
-            <span class="icon" style="margin-right: 4px">
-              <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
-            </span>
-            <span style="color: white">Code</span>
-          </a>
-        </span>
-        <!-- arXiv Link. -->
-        <span class="link-block">
-          <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
-            <span class="icon" style="margin-right: 8px">
-              <i class="fas fa-file" style="color: white"></i>
-            </span>
-            <span style="color: white">Paper</span>
-          </a>
-        </span>
-        <!-- Homepage Link. -->
-        <span class="link-block">
-          <a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
-            <span class="icon" style="margin-right: 8px">
-              <i class="fas fa-home" style="color: white"></i>
-            </span>
-            <span style="color: white">Homepage</span>
-          </a>
-        </span>
-        <!-- Client Link. -->
-        <span class="link-block">
-          <a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
-            <span class="icon" style="margin-right: 8px">
-              <i class="fas fa-download" style="color: white"></i>
-            </span>
-            <span style="color: white">Download</span>
-          </a>
-        </span>
-      </div>
-    </div>
-    <!-- New Demo Links -->
-  </div>
-</body></html>

 <html>
   <head>
+    <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
+    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
+    <style>
+      .link-block {
+        border: 1px solid transparent;
+        border-radius: 24px;
+        background-color: rgba(54, 54, 54, 1);
+        cursor: pointer !important;
+      }
+      .link-block:hover {
+        background-color: rgba(54, 54, 54, 0.75) !important;
+        cursor: pointer !important;
+      }
+      .external-link {
+        display: inline-flex;
+        align-items: center;
+        height: 36px;
+        line-height: 36px;
+        padding: 0 16px;
+        cursor: pointer !important;
+      }
+      .external-link,
+      .external-link:hover {
+        cursor: pointer !important;
+      }
+      a {
+        text-decoration: none;
+      }
+    </style>
   </head>
+  <body>
     <div style="
         display: flex;
         flex-direction: column;
+        justify-content: center;
         align-items: center;
+        text-align: center;
+        background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
+        padding: 24px;
+        gap: 24px;
+        border-radius: 8px;
       ">
+      <div style="
+          display: flex;
+          flex-direction: column;
+          align-items: center;
+          gap: 16px;
+        ">
+        <div style="display: flex; flex-direction: column; gap: 8px">
+          <h1 style="
+              font-size: 48px;
+              color: #fafafa;
+              margin: 0;
+              font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
+                'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
+            ">
+            MinerU: PDF Extraction &amp; Voice Reading Demo
+          </h1>
+        </div>
       </div>
+      <p style="
+          margin: 0;
+          line-height: 1.6rem;
+          font-size: 16px;
+          color: #fafafa;
+          opacity: 0.8;
+        ">
+        A one-stop, open-source, high-quality tool for data extraction and PDF voice reading,<br>
+        supporting PDF, webpage, and e-book extraction.
+      </p>
+      <style>
+        .link-block {
+          display: inline-block;
+        }
+        .link-block + .link-block {
+          margin-left: 20px;
+        }
+      </style>
+      <div class="column has-text-centered">
+        <div class="publication-links">
+          <!-- Code Link. -->
+          <span class="link-block">
+            <a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
+              <span class="icon" style="margin-right: 4px">
+                <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
+              </span>
+              <span style="color: white">Code</span>
+            </a>
+          </span>
+          <!-- arXiv Link. -->
+          <span class="link-block">
+            <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
+              <span class="icon" style="margin-right: 8px">
+                <i class="fas fa-file" style="color: white"></i>
+              </span>
+              <span style="color: white">Paper</span>
+            </a>
+          </span>
+          <!-- Homepage Link. -->
+          <span class="link-block">
+            <a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
+              <span class="icon" style="margin-right: 8px">
+                <i class="fas fa-home" style="color: white"></i>
+              </span>
+              <span style="color: white">Homepage</span>
+            </a>
+          </span>
+          <!-- Client Link. -->
+          <span class="link-block">
+            <a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
+              <span class="icon" style="margin-right: 8px">
+                <i class="fas fa-download" style="color: white"></i>
+              </span>
+              <span style="color: white">Download</span>
+            </a>
+          </span>
+          <!-- Voice Reading Demo Link. -->
+          <span class="link-block">
+            <a href="https://mineru.org.cn/voice?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
+              <span class="icon" style="margin-right: 8px">
+                <i class="fas fa-volume-up" style="color: white"></i>
+              </span>
+              <span style="color: white">Voice Reading Demo</span>
+            </a>
+          </span>
+        </div>
+      </div>
+      <!-- New Demo Links -->
+    </div>
+  </body>
+</html>

tts.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# tts.py
+import os
+import tempfile
+import requests
+from playsound import playsound
+def text_to_speech_openai(text, language="en"):
+    """
+    Convert text to speech using a hypothetical OpenAI TTS API and play it.
+
+    Note: OpenAI Whisper is for speech recognition. ``openai.Audio.synthesize``
+    below is a placeholder; replace the endpoint and parameters with actual
+    API details when available. Until then this function will likely raise.
+
+    Args:
+        text: Text to synthesize.
+        language: Language code forwarded to the API call.
+
+    Raises:
+        ValueError: if the ``api_key_oai`` environment variable is unset or empty.
+        RuntimeError: if the synthesis call fails or the response has no
+            ``audio_url`` entry (the original exception is chained).
+        requests.RequestException: if downloading the audio fails, times out,
+            or returns an HTTP error status.
+    """
+    import openai
+    api_key = os.getenv("api_key_oai")
+    if not api_key:
+        raise ValueError("API key for OpenAI TTS not found in environment variable 'api_key_oai'")
+    openai.api_key = api_key
+    try:
+        # Hypothetical API call -- adjust the engine name and parameters as per actual API documentation.
+        response = openai.Audio.synthesize(
+            engine="tts",      # Hypothetical engine name for TTS
+            text=text,
+            language=language
+        )
+        audio_url = response["audio_url"]
+    except Exception as e:
+        # Chain the cause so the root error stays visible in the traceback.
+        raise RuntimeError(f"OpenAI TTS synthesis failed: {e}") from e
+    # Download the audio. Bound the wait and reject HTTP error responses so an
+    # error page is never written out and played as if it were an MP3.
+    audio_response = requests.get(audio_url, timeout=30)
+    audio_response.raise_for_status()
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_file.write(audio_response.content)
+        tmp_file_path = tmp_file.name
+    try:
+        playsound(tmp_file_path)
+    finally:
+        # delete=False leaves the file on disk, so remove it explicitly.
+        # Cleanup is best-effort and must not mask a playback error.
+        try:
+            os.remove(tmp_file_path)
+        except OSError:
+            pass
+def text_to_speech_gtts(text, language="en"):
+    """
+    Fallback text-to-speech using the gTTS library.
+
+    Synthesizes ``text`` to a temporary MP3 file, plays it with ``playsound``
+    and removes the file afterwards.
+
+    Args:
+        text: Text to synthesize.
+        language: Language code passed to gTTS as ``lang``.
+    """
+    from gtts import gTTS
+    tts = gTTS(text=text, lang=language)
+    # Only reserve a unique path here. The handle is closed before gTTS writes
+    # to it, because reopening a still-open NamedTemporaryFile by name fails
+    # on Windows.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_file_path = tmp_file.name
+    try:
+        tts.save(tmp_file_path)
+        playsound(tmp_file_path)
+    finally:
+        # delete=False leaves the file on disk, so remove it explicitly.
+        # Cleanup is best-effort and must not mask a synthesis/playback error.
+        try:
+            os.remove(tmp_file_path)
+        except OSError:
+            pass