# Import needed libraries
from PIL import Image
import gradio as gr
import torch
import requests
import re
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, TrOCRProcessor, VisionEncoderDecoderModel

# Download example images for the captioning tab
img_urls_1 = ['https://i.pinimg.com/564x/f7/f5/bd/f7f5bd929e05a852ff423e6e02deea54.jpg', 'https://i.pinimg.com/564x/b4/29/69/b4296962cb76a72354a718109835caa3.jpg',
        'https://i.pinimg.com/564x/f2/68/8e/f2688eccd6dd60fdad89ef78950b9ead.jpg']
for idx1, url1 in enumerate(img_urls_1):
  image = Image.open(requests.get(url1, stream=True).raw)
  image.save(f"image_{idx1}.png")

# Download example images for the text-extraction tabs
img_urls_2 = ['https://i.pinimg.com/564x/14/b0/07/14b0075ccd5ea35f7deffc9e5bd6de30.jpg', 'https://newsimg.bbc.co.uk/media/images/45510000/jpg/_45510184_the_writings_466_180.jpg',
        'https://cdn.shopify.com/s/files/1/0047/1524/9737/files/Cetaphil_Face_Wash_Ingredients_Optimized.png?v=1680923920', 'https://github.com/kawther12h/Image_Captioning-and-Text_Recognition/blob/main/handText22.jpg?raw=true','https://github.com/kawther12h/Image_Captioning-and-Text_Recognition/blob/main/handText11.jpg?raw=true']
for idx2, url2 in enumerate(img_urls_2):
  image = Image.open(requests.get(url2, stream=True).raw)
  image.save(f"tx_image_{idx2}.png")

# Load BLIP model and processor for image captioning
processor_blip = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model_blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
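# The models load on CPU by default; on a machine with a GPU one could optionally
# move them over (illustrative sketch, using the torch import above):
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model_blip.to(device)
#   # the processor's tensors would then also need .to(device) before generate()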

# Load the marefa MarianMT model for translation (English to Arabic)
translate = pipeline("translation", model="marefa-nlp/marefa-mt-en-ar")
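# Illustrative output shape (assumed, not from a real run): the pipeline returns
# a list of dicts, e.g. translate("hello") -> [{'translation_text': '...'}]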

def caption_and_translate(img, min_len, max_len):
    # Generate an English caption with BLIP
    raw_image = Image.open(img).convert('RGB')
    inputs_blip = processor_blip(raw_image, return_tensors="pt")
    # Cast the slider values to int, since Gradio sliders may return floats
    out_blip = model_blip.generate(**inputs_blip, min_length=int(min_len), max_length=int(max_len))
    english_caption = processor_blip.decode(out_blip[0], skip_special_tokens=True)

    # Translate the caption from English to Arabic
    arabic_caption = translate(english_caption)[0]['translation_text']

    # Wrap the Arabic text in a right-to-left container so it displays correctly
    translated_caption = f'<div dir="rtl">{arabic_caption}</div>'

    # Return both captions
    return english_caption, translated_caption
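# Minimal sanity check (illustrative; assumes the example images downloaded above):
#   en, ar = caption_and_translate("image_0.png", 30, 100)
#   print(en, ar)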


# Gradio interface with multiple outputs
img_cap_en_ar = gr.Interface(
    fn=caption_and_translate,
    inputs=[gr.Image(type='filepath', label='Image'),
            gr.Slider(label='Minimum Length', minimum=1, maximum=500, value=30),
            gr.Slider(label='Maximum Length', minimum=1, maximum=500, value=100)],
    outputs=[gr.Textbox(label='English Caption'),
             gr.HTML(label='Arabic Caption')],
    title='Image Captioning | وصف الصورة',
    description="Upload an image to generate an English & Arabic caption | قم برفع صورة وأرسلها ليظهر لك وصف للصورة",
    examples=[["image_2.png"]]
)


# Load the Donut model for digital (printed) text extraction
text_rec = pipeline("image-to-text", model="jinhybr/OCR-Donut-CORD")

# Reuse the English-to-Arabic translation pipeline loaded above

# Function to extract printed text from an image and translate it
def extract_text(image):
    # Run the OCR pipeline on the image
    result = text_rec(image)

    # Extract the generated text and strip Donut's structural tags
    text = result[0]['generated_text']
    text = re.sub(r'<[^>]*>', '', text)  # Remove all tag markup
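    # Illustrative only: Donut emits structured markup, e.g. something like
    #   "<s_nm>Cetaphil Face Wash</s_nm>" (the exact tags depend on the document),
    # which the regex above reduces to the plain text between the tags.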

    # Translate the extracted text from English to Arabic
    arabic_text = translate(text)[0]['translation_text']
    htranslated_text = f'<div dir="rtl">{arabic_text}</div>'

    # Return the extracted text and its translation
    return text, htranslated_text

# Define the Gradio interface
text_recognition = gr.Interface(
    fn=extract_text,                    # The function that processes the image
    inputs=gr.Image(type="pil"),        # Input is an image (PIL format)
    outputs=[gr.Textbox(label='Extracted text'),
             gr.HTML(label='Translation of extracted text')],
    title="Text Extraction and Translation | استخراج النص وترجمته",
    description="Upload an image, then press Submit to extract the text and translate it to Arabic | قم برفع الصورة وأرسلها ليظهر لك النص من الصورة",
    examples=[["tx_image_0.png"]],
)

# Load TrOCR model and processor for handwritten text extraction
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')

# Reuse the English-to-Arabic translation pipeline loaded above

def recognize_handwritten_text(image2):
    # Preprocess the image and generate text with TrOCR
    pixel_values = processor(images=image2, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Translate the extracted text from English to Arabic
    arabic_text = translate(generated_text)[0]['translation_text']
    htranslated_text = f'<div dir="rtl">{arabic_text}</div>'

    # Return the extracted text and the translated text
    return generated_text, htranslated_text
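# Minimal sanity check (illustrative; tx_image_3.png, downloaded above, is the
# repo's handText22.jpg handwriting sample):
#   print(recognize_handwritten_text(Image.open("tx_image_3.png").convert("RGB")))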

# Gradio interface with an image upload input and text outputs
handwritten_rec = gr.Interface(
    fn=recognize_handwritten_text,
    inputs=gr.Image(label="Upload Image"),
    outputs=[gr.Textbox(label='English Text'),
             gr.HTML(label='Arabic Text')],
    title="Handwritten Text Extraction | استخراج النص المكتوب بخط اليد وترجمته",
    description="Upload an image, then press Submit to extract the handwritten text and translate it to Arabic | قم برفع الصورة وأرسلها ليظهر لك النص من الصورة",
    examples=[["tx_image_1.png"]]
)

# Combine all interfaces into a tabbed interface
demo = gr.TabbedInterface([img_cap_en_ar, text_recognition, handwritten_rec],
                          ["Extract_Caption", "Extract_Digital_text", "Extract_HandWritten_text"])

if __name__ == "__main__":
    # Serve the app on all network interfaces at port 7860
    demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)