MohamedRashad committed
Commit e825481
Parent(s): 9d6dab6
Update README.md
README.md CHANGED
@@ -36,7 +36,11 @@ from transformers import NougatProcessor, VisionEncoderDecoderModel
 
 # Load the model and processor
 processor = NougatProcessor.from_pretrained("MohamedRashad/arabic-large-nougat")
-model = VisionEncoderDecoderModel.from_pretrained(
+model = VisionEncoderDecoderModel.from_pretrained(
+    "MohamedRashad/arabic-large-nougat",
+    torch_dtype=torch.bfloat16,
+    attn_implementation={"decoder": "flash_attention_2", "encoder": "eager"},
+)
 
 # Get the max context length of the model & dtype of the weights
 context_length = model.decoder.config.max_position_embeddings
@@ -46,10 +50,13 @@ torch_dtype = model.dtype
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 
+
 def predict(img_path):
     # prepare PDF image for the model
     image = Image.open(img_path)
-    pixel_values =
+    pixel_values = (
+        processor(image, return_tensors="pt").pixel_values.to(torch_dtype).to(device)
+    )
 
     # generate transcription
     outputs = model.generate(
@@ -61,10 +68,11 @@ def predict(img_path):
     )
 
     page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    page_sequence = processor.post_process_generation(page_sequence, fix_markdown=False)
     return page_sequence
 
+
 print(predict("path/to/page_image.jpg"))
+
 ```
 
 
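For reference, here is the updated README usage snippet assembled end to end as a minimal sketch. The imports of `torch` and `PIL.Image` and the keyword arguments passed to `model.generate(...)` are not visible in these hunks, so they are assumptions (generic Nougat-style settings), not the README's exact values; everything else follows the diff and its context lines.

```python
import torch
from PIL import Image
from transformers import NougatProcessor, VisionEncoderDecoderModel

# Load the model and processor
# (bfloat16 weights; FlashAttention-2 for the decoder requires the flash-attn package)
processor = NougatProcessor.from_pretrained("MohamedRashad/arabic-large-nougat")
model = VisionEncoderDecoderModel.from_pretrained(
    "MohamedRashad/arabic-large-nougat",
    torch_dtype=torch.bfloat16,
    attn_implementation={"decoder": "flash_attention_2", "encoder": "eager"},
)

# Get the max context length of the model & dtype of the weights
context_length = model.decoder.config.max_position_embeddings
torch_dtype = model.dtype

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def predict(img_path):
    # prepare PDF image for the model, matching the model's dtype and device
    image = Image.open(img_path)
    pixel_values = (
        processor(image, return_tensors="pt").pixel_values.to(torch_dtype).to(device)
    )

    # generate transcription
    # NOTE: the README's actual generation arguments are not shown in this diff;
    # min_length/max_length below are placeholder assumptions
    outputs = model.generate(
        pixel_values,
        min_length=1,
        max_length=context_length,
    )

    page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return page_sequence


print(predict("path/to/page_image.jpg"))
```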