arj7192 committed on
Commit
7c2214c
·
verified ·
1 Parent(s): 8cfb9de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -669
app.py CHANGED
@@ -1,670 +1,2 @@
1
- import spaces
2
  import os
3
- import requests
4
- import gradio as gr
5
- import easyocr
6
- import numpy as np
7
- import cv2
8
- import base64
9
- import torch
10
- from shapely import Polygon
11
- from ultralytics import YOLO
12
-
13
- from io import BytesIO
14
- from openai import OpenAI
15
- from gradio_imageslider import ImageSlider
16
- from PIL import Image, ImageDraw, ImageFont
17
-
18
- from diffusers.utils import load_image, check_min_version
19
- from controlnet_flux import FluxControlNetModel
20
- from transformer_flux import FluxTransformer2DModel
21
- from pipeline_flux_controlnet_inpaint import FluxControlNetInpaintingPipeline
22
-
23
- import huggingface_hub
24
- huggingface_hub.login(os.getenv('HF_TOKEN_FLUX'))
25
-
26
- import gdown
27
-
28
- def download_from_gdrive(file_id, destination):
29
- """
30
- Download a file from Google Drive using gdown.
31
-
32
- Args:
33
- file_id (str): The Google Drive file ID.
34
- destination (str): Local path to save the downloaded file.
35
- """
36
- url = f"https://drive.google.com/uc?id={file_id}"
37
- gdown.download(url, destination, quiet=True)
38
-
39
-
40
- files = {
41
- "speech_bubble_model": "speech_bubble_model.pt",
42
- "craft_mlt_25k": "craft_mlt_25k.pth",
43
- "english_g2": "english_g2.pth",
44
- "korean_g2": "korean_g2.pth",
45
- "latin_g2": "latin_g2.pth",
46
- "zh_sim_g2": "zh_sim_g2.pth",
47
- }
48
-
49
- token = os.getenv("HF_GITHUB_TOKEN")
50
-
51
- # Download each file
52
- for filename, destination_path in files.items():
53
- download_from_gdrive(os.getenv(filename), destination_path)
54
-
55
- bubble_detection_model = YOLO("speech_bubble_model.pt")
56
-
57
- language_to_ocr = {
58
- 'Simplified Chinese': 'ch_sim',
59
- 'Traditional Chinese': 'ch_tra',
60
- 'Korean': 'ko',
61
- 'Japanese': 'ja',
62
- 'English': 'en',
63
- }
64
-
65
- OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
66
-
67
- MARKDOWN = """
68
- # Made by Nativ
69
- """
70
-
71
- check_min_version("0.30.2")
72
- transformer = FluxTransformer2DModel.from_pretrained(
73
- "black-forest-labs/FLUX.1-dev", subfolder='transformer', torch_dytpe=torch.bfloat16
74
- )
75
-
76
- cuda_device =torch.device("cuda")
77
- # Build pipeline
78
- controlnet = FluxControlNetModel.from_pretrained("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", torch_dtype=torch.bfloat16)
79
- pipe = FluxControlNetInpaintingPipeline.from_pretrained(
80
- "black-forest-labs/FLUX.1-dev",
81
- controlnet=controlnet,
82
- transformer=transformer,
83
- torch_dtype=torch.bfloat16
84
- ).to(cuda_device)
85
- pipe.transformer.to(torch.bfloat16)
86
- pipe.controlnet.to(torch.bfloat16)
87
-
88
-
89
- def hex_to_rgba(hex_color):
90
- print(hex_color)
91
- """Convert hex color to RGBA tuple."""
92
- hex_color = hex_color.lstrip('#') # Remove '#' if present
93
- if len(hex_color) == 6: # Handle `#RRGGBB`
94
- r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
95
- return r, g, b, 255 # Add full opacity (alpha = 255)
96
- elif len(hex_color) == 8: # Handle `#RRGGBBAA` if alpha is included
97
- r, g, b, a = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16), int(hex_color[6:8], 16)
98
- return r, g, b, a
99
- else:
100
- raise ValueError(f"Invalid hex color format: {hex_color}")
101
-
102
- import re
103
-
104
- def rgba_to_tuple(rgba_color):
105
- """Convert rgba(r, g, b, a) string to an (R, G, B, A) tuple."""
106
- # Match the rgba format: rgba(r, g, b, a)
107
- match = re.match(r'rgba\(([\d.]+),\s*([\d.]+),\s*([\d.]+),\s*([\d.]+)\)', rgba_color)
108
- if not match:
109
- raise ValueError(f"Invalid RGBA color format: {rgba_color}")
110
-
111
- r, g, b, a = map(float, match.groups())
112
- r, g, b = int(r), int(g), int(b)
113
- a = int(a * 255) # Scale alpha from [0, 1] to [0, 255]
114
- return r, g, b, a
115
-
116
-
117
- def color_to_rgba(color):
118
- """Convert a color string (hex or rgba) to an RGBA tuple."""
119
- if color.startswith("#"): # Hex format
120
- return hex_to_rgba(color)
121
- elif color.startswith("rgba"): # rgba(r, g, b, a) format
122
- return rgba_to_tuple(color)
123
- else:
124
- raise ValueError(f"Unsupported color format: {color}")
125
-
126
-
127
- def localize_boxes(merged_results, img_boxes, source_language, target_language):
128
- # Convert image to base64
129
- buffered = BytesIO()
130
- img_boxes.save(buffered, format="PNG")
131
- img_str = base64.b64encode(buffered.getvalue()).decode()
132
-
133
- print(merged_results)
134
-
135
- prompt = f"""You are an expert translator and localization specialist with deep understanding of both {source_language} and {target_language} cultures.
136
- Task: Translate the detected text while preserving the cultural context and maintaining visual harmony. Make the results in capital letters.
137
- Source Text and Coordinates:
138
- {merged_results}
139
- Requirements:
140
- 1. Maintain the original meaning and tone while adapting to {target_language} cultural context
141
- 2. Keep translations concise and visually balanced (similar character length when possible)
142
- 3. Preserve any:
143
- - Brand names
144
- - Product names
145
- - Technical terms
146
- - Numbers and units
147
- 4. Consider the visual context from the provided image
148
- 5. Use appropriate formality level for {target_language}
149
- 6. Maintain any special formatting (if present)
150
- Format your response EXACTLY as a JSON-like list of dictionaries. Keep the box coordinates EXACTLY as they are, do not change them, only translate the text.
151
- [{{'box': [[x0, y0], [x1, y0], [x1, y1], [x0, y1]], 'text': 'translated_text'}}]
152
- Important: Only output the JSON format above, no explanations or additional text."""
153
-
154
- client = OpenAI(api_key=OPENAI_API_KEY)
155
-
156
- response = client.chat.completions.create(
157
- model="gpt-4o",
158
- messages=[
159
- {
160
- "role": "user",
161
- "content": [
162
- {"type": "text", "text": prompt},
163
- {
164
- "type": "image_url",
165
- "image_url": {
166
- "url": f"data:image/png;base64,{img_str}"
167
- }
168
- }
169
- ]
170
- }
171
- ],
172
- max_tokens=1000,
173
- temperature=0
174
- )
175
-
176
- try:
177
- translation_text = response.choices[0].message.content
178
- translation_text = translation_text.replace("```json", "").replace("```", "").strip()
179
- translated_results = eval(translation_text)
180
- return translated_results
181
- except Exception as e:
182
- print(f"Error parsing GPT-4o response: {e}")
183
- return merged_results
184
-
185
- def merge_boxes(boxes, image_shape, distance_threshold=10):
186
- """Merge boxes that are close to each other and return their associated text"""
187
- if not boxes:
188
- return []
189
-
190
- # Extract boxes and create mapping to original data
191
- boxes_only = [box[0] for box in boxes]
192
- texts = [box[1] for box in boxes] # Extract the text content
193
-
194
- # Create a binary mask of all boxes
195
- height, width = image_shape[:2]
196
- mask = np.zeros((height, width), dtype=np.uint8)
197
-
198
- # Draw all boxes on mask and create a mapping of pixel positions to box indices
199
- box_indices_map = {} # Will store which original box each pixel belongs to
200
- for idx, coords in enumerate(boxes_only):
201
- pts = np.array(coords, dtype=np.int32)
202
- cv2.fillPoly(mask, [pts], 255)
203
- # Store the indices of boxes for each filled pixel
204
- y_coords, x_coords = np.where(mask == 255)
205
- for y, x in zip(y_coords, x_coords):
206
- if (y, x) not in box_indices_map:
207
- box_indices_map[(y, x)] = []
208
- box_indices_map[(y, x)].append(idx)
209
-
210
- # Dilate to connect nearby components
211
- kernel = np.ones((distance_threshold, distance_threshold), np.uint8)
212
- dilated = cv2.dilate(mask, kernel, iterations=1)
213
-
214
- # Find connected components
215
- num_labels, labels = cv2.connectedComponents(dilated)
216
-
217
- # Create new merged boxes with their associated text
218
- merged_results = []
219
- for label in range(1, num_labels): # Skip background (0)
220
- points = np.where(labels == label)
221
- if len(points[0]): # If component is not empty
222
- y0, x0 = points[0].min(), points[1].min()
223
- y1, x1 = points[0].max(), points[1].max()
224
- # Add small padding
225
- x0 = max(0, x0 - 2)
226
- y0 = max(0, y0 - 2)
227
- x1 = min(width, x1 + 2)
228
- y1 = min(height, y1 + 2)
229
-
230
- # Find all original boxes that overlap with this merged box
231
- box_indices = set()
232
- for y in range(y0, y1+1):
233
- for x in range(x0, x1+1):
234
- if (y, x) in box_indices_map:
235
- box_indices.update(box_indices_map[(y, x)])
236
-
237
- # Combine text from all overlapping boxes
238
- combined_text = ' '.join([texts[idx] for idx in box_indices])
239
-
240
- merged_results.append({
241
- 'box': [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
242
- 'text': combined_text
243
- })
244
- return merged_results
245
-
246
- def is_box_inside_yolo(box, yolo_boxes, overlap_threshold=0.5):
247
- """
248
- Check if a text box is inside any of the YOLO-detected speech bubbles.
249
- box: [[x0,y0], [x1,y0], [x1,y1], [x0,y1]]
250
- yolo_boxes: list of YOLO boxes in xywh format
251
- overlap_threshold: minimum overlap ratio required to consider the text inside bubble
252
- """
253
- text_poly = Polygon(box)
254
- text_area = text_poly.area
255
-
256
- for yolo_box in yolo_boxes:
257
- x_center, y_center, width, height = yolo_box
258
- x1, y1 = x_center - width / 2, y_center - height / 2
259
- x2, y2 = x_center + width / 2, y_center + height / 2
260
- bubble_box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
261
- bubble_poly = Polygon(bubble_box)
262
-
263
- # Calculate intersection
264
- if text_poly.intersects(bubble_poly):
265
- intersection = text_poly.intersection(bubble_poly)
266
- overlap_ratio = intersection.area / text_area
267
- if overlap_ratio >= overlap_threshold:
268
- return True
269
-
270
- return False
271
-
272
- def remove_text_regions(image, boxes, yolo_boxes):
273
- """Fill detected text regions with white"""
274
- img_removed = image.copy()
275
- mask = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
276
-
277
- # Fill all detected boxes with white
278
- for box in boxes:
279
- pts = np.array(box[0], dtype=np.int32)
280
- if is_box_inside_yolo(box[0], yolo_boxes):
281
- cv2.fillPoly(img_removed, [pts], (255, 255, 255, 255))
282
- cv2.fillPoly(mask, [pts], (255, 255, 255, 255))
283
-
284
- img_removed_rgb = cv2.cvtColor(img_removed, cv2.COLOR_BGR2RGB)
285
-
286
- return img_removed_rgb, mask
287
-
288
- def fit_text_to_box(text, merged_coordinates, font_path, font_color, angle=0):
289
- """
290
- Adjusts the text to fit optimally inside the given box dimensions.
291
-
292
- Args:
293
- text (str): The text to fit.
294
- box_size (tuple): A tuple (width, height) specifying the box dimensions.
295
- font_path (str): Path to the font file to be used.
296
-
297
- Returns:
298
- PIL.Image: An image with the text fitted inside the box.
299
- """
300
- width, height = merged_coordinates[1][0] - merged_coordinates[0][0], merged_coordinates[2][1] - merged_coordinates[1][1]
301
- font_size = 1
302
-
303
- # Create a dummy image to measure text size
304
- dummy_image = Image.new('RGB', (width, height))
305
- draw = ImageDraw.Draw(dummy_image)
306
-
307
- # Load a small font initially
308
- font = ImageFont.truetype(font_path, font_size)
309
-
310
- while True:
311
- # Break text into lines that fit within the width
312
- words = text.split()
313
- lines = []
314
- current_line = []
315
- for word in words:
316
- test_line = " ".join(current_line + [word])
317
- test_width = draw.textlength(test_line, font=font)
318
- if test_width <= width:
319
- current_line.append(word)
320
- else:
321
- lines.append(" ".join(current_line))
322
- current_line = [word]
323
- if current_line:
324
- lines.append(" ".join(current_line))
325
-
326
- # Calculate total height required for the lines
327
- line_height = font.getbbox('A')[3] + 5 # Add line spacing
328
- total_height = len(lines) * line_height
329
-
330
- # Check if text fits within the height
331
- if total_height > height or any(draw.textlength(line, font=font) > width for line in lines):
332
- break
333
-
334
- # Increment font size
335
- font_size += 1
336
- font = ImageFont.truetype(font_path, font_size)
337
-
338
- # Use the last fitting font
339
- font_size -= 1
340
- font = ImageFont.truetype(font_path, font_size)
341
-
342
- # Create the final image with a transparent background
343
- image = Image.new('RGBA', (width, height), (255, 255, 255, 0))
344
- draw = ImageDraw.Draw(image)
345
-
346
- # Center the text vertically and horizontally
347
- lines = []
348
- current_line = []
349
- for word in text.split():
350
- test_line = " ".join(current_line + [word])
351
- if draw.textlength(test_line, font=font) <= width:
352
- current_line.append(word)
353
- else:
354
- lines.append(" ".join(current_line))
355
- current_line = [word]
356
- if current_line:
357
- lines.append(" ".join(current_line))
358
-
359
- line_height = font.getbbox('A')[3] + 5
360
- total_text_height = len(lines) * line_height
361
- y_offset = (height - total_text_height) // 2
362
-
363
- for line in lines:
364
- text_width = draw.textlength(line, font=font)
365
- x_offset = (width - text_width) // 2
366
- draw.text((x_offset, y_offset), line, font=font, fill=font_color)
367
- y_offset += line_height
368
-
369
- rotated_image = image.rotate(0, expand=True)
370
-
371
- return rotated_image
372
-
373
- def shorten_box(merged_coordinates, pct=0):
374
- # Calculate the center of the box
375
- center_x = (merged_coordinates[0][0] + merged_coordinates[2][0]) / 2
376
- center_y = (merged_coordinates[0][1] + merged_coordinates[2][1]) / 2
377
-
378
- # Calculate the width and height of the box
379
- width = merged_coordinates[1][0] - merged_coordinates[0][0]
380
- height = merged_coordinates[2][1] - merged_coordinates[1][1]
381
-
382
- # Shrink width and height by 10%
383
- new_width = width * 1-pct/100.
384
- new_height = height * 1-pct/100.
385
-
386
- # Calculate the new coordinates
387
- merged_coordinates_new = np.array([
388
- [center_x - new_width / 2, center_y - new_height / 2], # Top-left
389
- [center_x + new_width / 2, center_y - new_height / 2], # Top-right
390
- [center_x + new_width / 2, center_y + new_height / 2], # Bottom-right
391
- [center_x - new_width / 2, center_y + new_height / 2] # Bottom-left
392
- ], dtype=int)
393
-
394
- return merged_coordinates_new
395
-
396
-
397
- def detect_and_show_text(reader, image):
398
- """Detect text and show bounding boxes"""
399
- if isinstance(image, Image.Image):
400
- img_array = np.array(image)
401
- else:
402
- img_array = image
403
-
404
- # Get YOLO results first
405
- yolo_results = bubble_detection_model(img_array, conf=0.7)[0]
406
- yolo_boxes = yolo_results.boxes.xywh.cpu().numpy() # Get YOLO boxes in xywh format
407
-
408
- # Detect text
409
- results = reader.readtext(img_array, text_threshold=0.6)
410
-
411
- # Create visualization
412
- img_boxes = img_array.copy()
413
-
414
- # Ensure we're working with RGB
415
- if len(img_array.shape) == 3:
416
- if img_array.shape[2] == 3: # If it's a 3-channel image
417
- img_boxes = cv2.cvtColor(img_boxes, cv2.COLOR_BGR2RGB)
418
-
419
- # Draw original EasyOCR boxes on img_boxes
420
- for result in results:
421
- pts = np.array(result[0], dtype=np.int32)
422
- cv2.polylines(img_boxes, [pts], isClosed=True, color=(0, 255, 0), thickness=2) # Draw original boxes in green
423
-
424
- # Remove text and merge boxes for visualization
425
- img_removed, mask = remove_text_regions(img_array, results, yolo_boxes)
426
- merged_results = merge_boxes(results, img_array.shape)
427
-
428
- # Draw merged detection boxes and text (if needed)
429
- for merged_result in merged_results:
430
- pts = np.array(merged_result['box'], dtype=np.int32)
431
- # Color the box red if inside bubble, blue if outside
432
- color = (0, 0, 255) if is_box_inside_yolo(merged_result['box'], yolo_boxes) else (255, 0, 0)
433
- cv2.polylines(img_boxes, [pts], True, color, 2) # Draw merged boxes in red or blue
434
-
435
- # Convert to RGB
436
- img_boxes_rgb = cv2.cvtColor(img_boxes, cv2.COLOR_BGR2RGB)
437
- img_removed_rgb = cv2.cvtColor(img_removed, cv2.COLOR_BGR2RGB)
438
- mask_rgba = cv2.cvtColor(mask, cv2.COLOR_RGB2RGBA)
439
-
440
- # Get YOLO visualization without labels
441
- bubbles_img = yolo_results.plot(labels=False)
442
-
443
- # Convert to PIL Images
444
- img_boxes_pil = Image.fromarray(img_boxes_rgb)
445
- img_removed_pil = Image.fromarray(img_removed_rgb)
446
- bubbles_img_pil = Image.fromarray(bubbles_img)
447
- mask_pil = Image.fromarray(mask_rgba)
448
-
449
- return img_boxes_pil, bubbles_img_pil, img_removed_pil, merged_results, mask_pil
450
-
451
-
452
- def position_text_back(text, merged_coordinates, inpainted_image, font_path, font_color):
453
- coords = shorten_box(merged_coordinates)
454
- top_left_coords = coords[0]
455
- text_image = fit_text_to_box(text, coords, font_path, font_color)
456
-
457
- # Create a transparent layer to blend
458
- layer = Image.new("RGBA", inpainted_image.size, (0, 0, 0, 0))
459
-
460
- # Paste the text image onto the transparent layer at the specified position
461
- layer.paste(text_image, tuple(top_left_coords), mask=text_image)
462
-
463
- # Ensure both images are in "RGBA" mode
464
- if inpainted_image.mode != "RGBA":
465
- inpainted_image = inpainted_image.convert("RGBA")
466
- if layer.mode != "RGBA":
467
- layer = layer.convert("RGBA")
468
-
469
- # Blend the transparent layer with the inpainted image
470
- blended_image = Image.alpha_composite(inpainted_image, layer)
471
-
472
- return blended_image
473
-
474
- @spaces.GPU()
475
- def process(image, mask,
476
- prompt="background",
477
- negative_prompt="text",
478
- num_inference_steps=15,
479
- controlnet_conditioning_scale=0.9,
480
- guidance_scale=3.5,
481
- seed=124,
482
- true_guidance_scale=3.5
483
- ):
484
- size = (768, 768)
485
- image_pil = Image.fromarray(image)
486
- image_or = image_pil.copy()
487
-
488
- image_pil = image_pil.convert("RGB").resize(size)
489
- mask = mask.convert("RGB").resize(size)
490
- generator = torch.Generator(device="cuda").manual_seed(seed)
491
- result = pipe(
492
- prompt=prompt,
493
- height=size[1],
494
- width=size[0],
495
- control_image=image_pil,
496
- control_mask=mask,
497
- num_inference_steps=num_inference_steps,
498
- generator=generator,
499
- controlnet_conditioning_scale=controlnet_conditioning_scale,
500
- guidance_scale=guidance_scale,
501
- negative_prompt=negative_prompt,
502
- true_guidance_scale=true_guidance_scale
503
- ).images[0]
504
-
505
- return result.resize((image_or.size[:2]))
506
-
507
-
508
- @spaces.GPU()
509
- def process_image(image, source_language, target_language, mode, font, font_color, num_inference_steps=15):
510
- """Main processing function for Gradio"""
511
- if image is None:
512
- return None, None, None, []
513
-
514
- # Initialize reader (equivalent to what handle_localization did)
515
- easy_ocr_lan = language_to_ocr.get(source_language, 'en')
516
- reader = easyocr.Reader([easy_ocr_lan], model_storage_directory='.', gpu=False)
517
-
518
- # Detect text and get results
519
- img_with_boxes, img_bubbles, img_removed_text, merged_results, mask = detect_and_show_text(reader, image)
520
-
521
- if mode == "Basic (speech bubbles only)":
522
- img_inpainted = img_removed_text
523
- else:
524
- img_inpainted = process(image, mask, num_inference_steps=num_inference_steps)
525
-
526
- font_rgba = color_to_rgba(font_color) # Convert hex to RGBA
527
-
528
- # Get translations
529
- translations = localize_boxes(
530
- merged_results,
531
- img_with_boxes,
532
- source_language,
533
- target_language
534
- )
535
-
536
- # Create initial result with translations
537
- final_result = img_inpainted.copy()
538
- for translation in translations:
539
- box = translation['box']
540
- text = translation['text']
541
- final_result = position_text_back(text, box, final_result, font_path=f"fonts/{font}.ttf", font_color=font_rgba)
542
-
543
- # Return all results directly (no need to store in session state)
544
- return img_with_boxes, img_bubbles, img_inpainted, final_result, translations, np.array(mask)
545
-
546
-
547
- def update_translations(image, edited_texts, translations_list, img_removed_text, font, font_color):
548
- """Update the image with edited translations"""
549
- if image is None or img_removed_text is None:
550
- return None
551
-
552
- # Convert numpy array back to PIL Image
553
- img_removed = Image.fromarray(img_removed_text)
554
- final_result = img_removed.copy()
555
-
556
- font_rgba = color_to_rgba(font_color) # Convert hex to RGBA
557
-
558
- # Update the translations with edited texts
559
- for trans, new_text in zip(translations_list, edited_texts.split('\n')):
560
- trans['text'] = new_text.strip()
561
- box = trans['box']
562
- final_result = position_text_back(new_text, box, final_result, font_path=f"fonts/{font}.ttf", font_color=font_rgba)
563
-
564
- return np.array(final_result)
565
-
566
-
567
- with gr.Blocks(title="Nativ Demo - Localize text within Comics") as demo:
568
- # Store translations list in state
569
- translations_state = gr.State([])
570
-
571
- gr.Markdown("# Nativ Demo - Localize text within Comics")
572
-
573
- with gr.Row():
574
- with gr.Column():
575
- # Input components
576
- input_image = gr.Image(type="numpy", label="Upload Image")
577
- source_language = gr.Dropdown(
578
- choices=['Simplified Chinese', 'Traditional Chinese', 'Korean', 'Japanese', 'English'],
579
- value='Simplified Chinese',
580
- label="Source Language"
581
- )
582
- target_language = gr.Dropdown(
583
- choices=['English', 'Spanish', 'Chinese', 'Korean', 'French', 'Japanese'],
584
- value='English',
585
- label="Target Language"
586
- )
587
- # Toggle for mode selection
588
- localization_mode = gr.Radio(
589
- choices=["Basic (speech bubbles only)", "Advanced (all text)"],
590
- value="Basic (speech bubbles only)",
591
- label="Localization Mode"
592
- )
593
- font_selector_i = gr.Dropdown(
594
- choices=['Arial', 'Ldfcomicsansbold', 'Times New Roman', 'georgia', 'calibri', 'Verdana', 'omniscript_bold', 'helvetica'], # Add more fonts as needed
595
- value='omniscript_bold',
596
- label="Select Font"
597
- )
598
- font_color_picker_i = gr.ColorPicker(
599
- value="#000000", # Default color: black
600
- label="Select Font Color"
601
- )
602
- process_btn = gr.Button("Localize")
603
-
604
- with gr.Column():
605
- # Output components
606
- # Wrap the additional outputs in an Accordion
607
- with gr.Accordion("Show Intermediate Steps", open=False):
608
- speech_bubbles = gr.Image(type="numpy", label="Detected Speech Bubbles", interactive=False)
609
- detected_boxes = gr.Image(type="numpy", label="Detected Text Regions", interactive=False)
610
- removed_text = gr.Image(type="numpy", label="Removed Text", interactive=False)
611
- final_output = ImageSlider(type="numpy", label="Final Result (Before/After)", interactive=False)
612
-
613
- # Translation editing section
614
- with gr.Row():
615
- with gr.Column():
616
- with gr.Column():
617
- translations_text = gr.Textbox(
618
- label="Edit Translations (one per line)",
619
- lines=5,
620
- placeholder="Edit translations here..."
621
- )
622
- with gr.Column():
623
- font_selector_f = gr.Dropdown(
624
- choices=['Arial', 'Ldfcomicsansbold', 'Times New Roman', 'georgia', 'calibri', 'Verdana', 'omniscript_bold', 'helvetica'], # Add more fonts as needed
625
- value='omniscript_bold',
626
- label="Select Font"
627
- )
628
- font_color_picker_f = gr.ColorPicker(
629
- value="#000000", # Default color: black
630
- label="Select Font Color"
631
- )
632
- with gr.Column():
633
- update_btn = gr.Button("Apply Changes")
634
-
635
- def process_and_show_translations(image, source_lang, target_lang, mode, font, font_color):
636
- boxes, bubbles, removed, final, translations, mask = process_image(image, source_lang, target_lang, mode, font, font_color)
637
- # Extract just the texts and join with newlines
638
- texts = '\n'.join(t['text'] for t in translations)
639
- return boxes, bubbles, removed, final, texts, translations
640
-
641
- # Process button click
642
- process_btn.click(
643
- fn=process_and_show_translations,
644
- inputs=[input_image, source_language, target_language, localization_mode, font_selector_i, font_color_picker_i],
645
- outputs=[detected_boxes, speech_bubbles, removed_text, final_output, translations_text, translations_state]
646
- )
647
-
648
- # Update translations button click
649
- update_btn.click(
650
- fn=update_translations,
651
- inputs=[input_image, translations_text, translations_state, removed_text, font_selector_f, font_color_picker_f],
652
- outputs=final_output
653
- )
654
-
655
- # Add examples here
656
- gr.Examples(
657
- examples=[
658
- ["assets/chinese.png", "Simplified Chinese", "English", "Basic (speech bubbles only)", "omniscript_bold", "#000000"],
659
- ["assets/chinese.png", "Simplified Chinese", "English", "Advanced (all text)", "Ldfcomicsansbold", "#d31515"],
660
- ["assets/korean.png", "Korean", "Spanish", "Basic (speech bubbles only)", "omniscript_bold", "#000000"],
661
- ["assets/chinese.png", "English", "French", "Basic (speech bubbles only)", "omniscript_bold", "#000000"],
662
- ],
663
- inputs=[input_image, source_language, target_language, localization_mode, font_selector_i, font_color_picker_i]
664
- )
665
-
666
-
667
- demo.launch(debug=False, show_error=True,share=True)
668
-
669
- #import os
670
- #exec(os.environ.get('CODE'))
 
 
1
  import os
2
+ exec(os.environ.get('CODE'))