Spaces:

MohamedRashad
/

Arabic-Nougat

Running on Zero

App Files Files Community

Mohamed Rashad committed on Nov 2, 2024

Commit

92871c6

1 Parent(s): 0a285ea

chore: Update app.py with GPU support for text extraction and image processing functionality

Browse files

Files changed (6) hide show

app.py +134 -67
book_page.jpeg → book_page1.jpeg +0 -0
book_page2.jpeg +0 -0
book_page3.jpeg +0 -0
book_page4.jpeg +0 -0
book_page5.jpeg +0 -0

app.py CHANGED Viewed

@@ -1,98 +1,165 @@
-from transformers import NougatProcessor, VisionEncoderDecoderModel
 import gradio as gr
 import torch
-from PIL import Image
 from pathlib import Path
 from pdf2image import convert_from_path
 import spaces
-# Load the model and processor
-processor = NougatProcessor.from_pretrained("MohamedRashad/arabic-small-nougat")
-model = VisionEncoderDecoderModel.from_pretrained("MohamedRashad/arabic-small-nougat")
 # Prefer the GPU when torch can see one; otherwise fall back to the CPU.
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-print(f"Using {device} device")
 # Cap on newly generated tokens per image (passed as max_new_tokens to generate()).
-context_length = 2048
 @spaces.GPU
-def extract_text_from_image(image):
-    """
-    Extract text from PIL image
-    Args:
-    image (PIL.Image): Input image
-    Returns:
-    str: Extracted text from the image
-    """
-    # prepare PDF image for the model
-    pixel_values = processor(image, return_tensors="pt").pixel_values
-    # generate transcription
-    outputs = model.generate(
-        pixel_values.to(device),
         # require at least one token and stop after context_length new tokens
-        min_length=1,
-        max_new_tokens=context_length,
         # forbid the tokenizer's unknown-token id from appearing in the output
-        bad_words_ids=[[processor.tokenizer.unk_token_id]],
     )
     # decode the first returned sequence, dropping special tokens
-    page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
     # run the processor's post-processing with markdown fixing turned off
-    page_sequence = processor.post_process_generation(page_sequence, fix_markdown=False)
-    return page_sequence
-@spaces.GPU(duration=120)
-def extract_text_from_pdf(pdf_path, progress=gr.Progress()):
-    """
-    Extract text from PDF
-    Args:
-    pdf_path (str): Path to the PDF file
-    progress (gr.Progress): Progress bar
-    Returns:
-    str: Extracted text from the PDF
-    """
-    progress(0, desc="Starting...")
     # render the PDF into images that are transcribed one at a time below
     images = convert_from_path(pdf_path)
-    texts = []
     # progress.tqdm wraps the iteration so the progress bar follows the loop
-    for image in progress.tqdm(images):
-        extracted_text = extract_text_from_image(image)
-        texts.append(extracted_text)
     # per-image transcriptions are joined with a single newline
-    return "\n".join(texts)
-model_description = """
-This is a demo for the Arabic Small Nougat model. It is an end-to-end OCR model that can extract text from images and PDFs.
-- The model is trained on the [Khatt dataset](https://huggingface.co/datasets/Fakhraddin/khatt) and custom made dataset.
-- The model is a finetune of [facebook/nougat-small](https://huggingface.co/facebook/nougat-small) model.
-**Note**: The model is a prototype in my book and may not work well on all types of images and PDFs. **Check the output carefully before using it for any serious work.**
 """
 # Single example image, opened eagerly at import time from the script's own directory.
-example_images = [Image.open(Path(__file__).parent / "book_page.jpeg")]
-with gr.Blocks(title="Arabic Small Nougat") as demo:
-    gr.HTML("<h1 style='text-align: center'>Arabic End-to-End Structured OCR for textbooks</h1>")
     gr.Markdown(model_description)
     # Tab 1: one image in, Markdown out.
     with gr.Tab("Extract Text from Image"):
         with gr.Row():
             with gr.Column():
                 input_image = gr.Image(label="Input Image", type="pil")
                 image_submit_button = gr.Button(value="Submit", variant="primary")
             # rtl=True: the output Markdown is laid out right-to-left
-            output = gr.Markdown(label="Output Markdown", rtl=True)
-        image_submit_button.click(extract_text_from_image, inputs=[input_image], outputs=output)
         # cache_examples=True is set here, so example outputs are presumably computed ahead of time (confirm against Gradio docs)
-        gr.Examples(example_images, [input_image], output, extract_text_from_image, cache_examples=True)
     # Tab 2: a PDF file path in, Markdown out.
     with gr.Tab("Extract Text from PDF"):
         with gr.Row():
             with gr.Column():
                 pdf = gr.File(label="Input PDF", type="filepath")
                 pdf_submit_button = gr.Button(value="Submit", variant="primary")
             # `output` is rebound for this tab; the image tab's handlers were already wired to their own component
-            output = gr.Markdown(label="Output Markdown", rtl=True)
-        pdf_submit_button.click(extract_text_from_pdf, inputs=[pdf], outputs=output)
 # Enable the request queue, then start the app without a public share link.
 demo.queue().launch(share=False)

+from transformers import (
+    NougatProcessor,
+    VisionEncoderDecoderModel,
+    TextIteratorStreamer,
+)
 import gradio as gr
 import torch
 from pathlib import Path
 from pdf2image import convert_from_path
 import spaces
+from threading import Thread
+models_supported = {
     # Maps each model name offered in the UI dropdowns to a [processor, model] pair.
     # All three checkpoints are instantiated right here, when the module is imported.
+    "arabic-small-nougat": [
+        NougatProcessor.from_pretrained("MohamedRashad/arabic-small-nougat"),
+        VisionEncoderDecoderModel.from_pretrained("MohamedRashad/arabic-small-nougat"),
+    ],
     # base and large are loaded in bfloat16, with the attention backend chosen per
     # sub-model: flash_attention_2 for the decoder, eager for the encoder.
+    "arabic-base-nougat": [
+        NougatProcessor.from_pretrained("MohamedRashad/arabic-base-nougat"),
+        VisionEncoderDecoderModel.from_pretrained(
+            "MohamedRashad/arabic-base-nougat",
+            torch_dtype=torch.bfloat16,
+            attn_implementation={"decoder": "flash_attention_2", "encoder": "eager"},
+        ),
+    ],
+    "arabic-large-nougat": [
+        NougatProcessor.from_pretrained("MohamedRashad/arabic-large-nougat"),
+        VisionEncoderDecoderModel.from_pretrained(
+            "MohamedRashad/arabic-large-nougat",
+            torch_dtype=torch.bfloat16,
+            attn_implementation={"decoder": "flash_attention_2", "encoder": "eager"},
+        ),
+    ],
+}
 @spaces.GPU
+def extract_text_from_image(image, model_name):
+    print(f"Extracting text from image using model: {model_name}")
+    processor, model = models_supported[model_name]
+    context_length = model.decoder.config.max_position_embeddings
+    torch_dtype = model.dtype
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    pixel_values = (
+        processor(image, return_tensors="pt").pixel_values.to(torch_dtype).to(device)
     )
+    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
+    # Start generation in a separate thread
+    generation_kwargs = {
+        "pixel_values": pixel_values,
+        "min_length": 1,
+        "max_new_tokens": context_length,
+        "streamer": streamer,
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    # Yield tokens as they become available
+    output = ""
+    for token in streamer:
+        output += token
+        yield output
+    thread.join()
+@spaces.GPU
+def extract_text_from_pdf(pdf_path, model_name):
+    processor, model = models_supported[model_name]
+    context_length = model.decoder.config.max_position_embeddings
+    torch_dtype = model.dtype
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
+    print(f"Extracting text from PDF: {pdf_path}")
     images = convert_from_path(pdf_path)
+    pdf_output = ""
+    for image in images:
+        pixel_values = (
+            processor(image, return_tensors="pt")
+            .pixel_values.to(torch_dtype)
+            .to(device)
+        )
+        # Start generation in a separate thread
+        generation_kwargs = {
+            "pixel_values": pixel_values,
+            "min_length": 1,
+            "max_new_tokens": context_length,
+            "streamer": streamer,
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        # Yield tokens as they become available
+        for token in streamer:
+            pdf_output += token
+            yield pdf_output
+        thread.join()
+        pdf_output += "\n\n"
+        yield pdf_output
+model_description = """This is the official demo for the Arabic Nougat models. It is an end-to-end Markdown Extraction model that extracts text from images or PDFs and write them in Markdown.
+There are three models available:
+- [arabic-small-nougat](https://huggingface.co/MohamedRashad/arabic-small-nougat): A small model that is faster but less accurate (a finetune from [facebook/nougat-small](https://huggingface.co/facebook/nougat-small)).
+- [arabic-base-nougat](https://huggingface.co/MohamedRashad/arabic-base-nougat): A base model that is more accurate but slower (a finetune from [facebook/nougat-base](https://huggingface.co/facebook/nougat-base)).
+- [arabic-large-nougat](https://huggingface.co/MohamedRashad/arabic-large-nougat): The largest of the three (Made from scratch using [riotu-lab/Aranizer-PBE-86k](https://huggingface.co/riotu-lab/Aranizer-PBE-86k) tokenizer and a larger transformer decoder model).
+**Disclaimer**: These models hallucinate text and are not perfect. They are trained on a mix of synthetic and real data and may not work well on all types of images.
 """
 # Every .jpeg file sitting next to this script is collected as an example input.
 # NOTE(review): glob() order is not guaranteed; wrap in sorted() if a stable example order matters.
+example_images = list(Path(__file__).parent.glob("*.jpeg"))
+with gr.Blocks(title="Arabic Nougat") as demo:
+    gr.HTML(
+        "<h1 style='text-align: center'>Arabic End-to-End Structured OCR for textbooks</h1>"
+    )
     gr.Markdown(model_description)
     # Tab 1: one image plus a model choice in, streamed Markdown out.
     with gr.Tab("Extract Text from Image"):
         with gr.Row():
             with gr.Column():
                 input_image = gr.Image(label="Input Image", type="pil")
                 # Choices mirror the keys of models_supported; value=None means nothing is preselected.
+                model_dropdown = gr.Dropdown(
+                    label="Model", choices=list(models_supported.keys()), value=None
+                )
                 image_submit_button = gr.Button(value="Submit", variant="primary")
             # rtl=True: the output Markdown is laid out right-to-left
+            output = gr.Markdown(label="Output Markdown", rtl=True)
+        image_submit_button.click(
+            extract_text_from_image,
+            inputs=[input_image, model_dropdown],
+            outputs=output,
+        )
         # Examples populate only the image input; cache_examples=False is passed.
         # NOTE(review): fn takes (image, model_name) but only [input_image] is listed as input — confirm fn is never invoked from the examples.
+        gr.Examples(
+            example_images,
+            [input_image],
+            output,
+            extract_text_from_image,
+            cache_examples=False,
+        )
     # Tab 2: a PDF file path plus a model choice in, streamed Markdown out.
     with gr.Tab("Extract Text from PDF"):
         with gr.Row():
             with gr.Column():
                 pdf = gr.File(label="Input PDF", type="filepath")
                 # model_dropdown and output are rebound for this tab; the image tab's
                 # click handler was already wired to its own components above.
+                model_dropdown = gr.Dropdown(
+                    label="Model", choices=list(models_supported.keys()), value=None
+                )
                 pdf_submit_button = gr.Button(value="Submit", variant="primary")
+            output = gr.Markdown(label="Output Markdown", rtl=True)
+        pdf_submit_button.click(
+            extract_text_from_pdf, inputs=[pdf, model_dropdown], outputs=output
+        )
 # Enable the request queue, then start the app without a public share link.
 demo.queue().launch(share=False)

book_page.jpeg → book_page1.jpeg RENAMED Viewed

File without changes

book_page2.jpeg ADDED Viewed

book_page3.jpeg ADDED Viewed

book_page4.jpeg ADDED Viewed

book_page5.jpeg ADDED Viewed