File size: 3,394 Bytes
a526622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import subprocess
import os
import subprocess
from PIL import Image, ImageDraw
import re
import json
import subprocess

# First "[[...]]" group made only of digits, commas and whitespace,
# e.g. "[[12, 34, 56, 78]]".
_BBOX_PATTERN = re.compile(r'\[\[([0-9,\s]+)\]\]')


def _extract_bbox(text):
    """Return the first ``[[x1, y1, x2, y2]]`` box found in *text*, or None.

    None is returned when there is no match or when the matched group does
    not consist of exactly four integers. The returned corners are ordered
    so that ``x1 <= x2`` and ``y1 <= y2``.
    """
    match = _BBOX_PATTERN.search(text)
    if match is None:
        return None

    try:
        coords = [int(part) for part in match.group(1).split(',')]
    except ValueError:
        # Matched something like "[[1,,2]]" or "[[1 2,3,4,5]]".
        return None

    if len(coords) != 4:
        return None

    x1, y1, x2, y2 = coords
    # ImageDraw.rectangle documents x1 >= x0 and y1 >= y0, so normalise
    # inverted corners instead of letting the draw call fail.
    return min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)


def process_inference_results(results, process_image=False):
    """
    Collect the text of each inference result and optionally annotate images.

    For every result the 'text' value is collected unchanged. When
    ``process_image`` is true, the file at 'image_path' is additionally
    loaded as an RGB image and, if 'text' contains a ``[[x1, y1, x2, y2]]``
    group, that rectangle is drawn on the image with a red outline.
    Images are not opened at all when ``process_image`` is false.

    :param results: Iterable of dicts with a 'text' key and (when
        ``process_image`` is true) an 'image_path' key.
    :param process_image: Whether to load and annotate the images.
    :return: ``(images, texts)`` when ``process_image`` is true, otherwise
        just the list of texts.
    """
    extracted_texts = [result['text'] for result in results]
    if not process_image:
        return extracted_texts

    processed_images = []
    for result in results:
        # convert() yields an independent in-memory image, so the source
        # file handle can be closed right away.
        with Image.open(result['image_path']) as source:
            img = source.convert("RGB")

        bbox = _extract_bbox(result['text'])
        if bbox is not None:
            ImageDraw.Draw(img).rectangle(list(bbox), outline="red", width=3)

        processed_images.append(img)

    return processed_images, extracted_texts

def inference_and_run(image_path, prompt, conv_mode="ferret_gemma_instruct", model_path="jadechoghari/Ferret-UI-Gemma2b", box=None, process_image=False):
    """
    Run the ``model_UI`` inference subprocess on one image and parse its output.

    Steps:
    1. Write a single-entry ``eval.json`` describing the image and prompt.
    2. Run ``python -m model_UI`` with that file, capturing stdout/stderr.
    3. Read the first ``.jsonl`` file found under ``eval_output.jsonl``
       (or ``eval_output.jsonl`` itself if it is a plain file) and pass the
       parsed records to ``process_inference_results``.

    :param image_path: Path of the image to run inference on.
    :param prompt: Text prompt; sent as ``"<image>\\n{prompt}"``.
    :param conv_mode: Value passed to ``--conv_mode``.
    :param model_path: Value passed to ``--model_path``.
    :param box: Optional region; when truthy it is stored as ``[[box]]``
        under ``box_x1y1x2y2`` and region-feature flags are added.
    :param process_image: Forwarded to ``process_inference_results``.
    :return: Whatever ``process_inference_results`` returns on success.
        On failure (subprocess error, missing output) the tuple
        ``(None, None)`` is returned regardless of ``process_image``.
    """
    # Open the image once and close it again; only its size is needed here.
    with Image.open(image_path) as img:
        image_w, image_h = img.size

    data_input = [{
        "id": 0,
        "image": os.path.basename(image_path),
        "image_h": image_h,
        "image_w": image_w,
        "conversations": [{"from": "human", "value": f"<image>\n{prompt}"}]
    }]

    if box:
        data_input[0]["box_x1y1x2y2"] = [[box]]

    with open("eval.json", "w") as json_file:
        json.dump(data_input, json_file)

    print("eval.json file created successfully.")

    # Only the basename is recorded in eval.json, so point --image_path at the
    # image's own directory (falls back to "." for bare file names).
    # NOTE(review): assumes model_UI resolves "image" relative to --image_path.
    image_dir = os.path.dirname(image_path) or "."

    cmd = [
        "python", "-m", "model_UI",
        "--model_path", model_path,
        "--data_path", "eval.json",
        "--image_path", image_dir,
        "--answers_file", "eval_output.jsonl",
        "--num_beam", "1",
        "--max_new_tokens", "32",
        "--conv_mode", conv_mode
    ]

    if box:
        cmd.extend(["--region_format", "box", "--add_region_feature"])

    # Keep the try body limited to the call that can raise CalledProcessError.
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred during inference:\n{e}")
        print(f"Subprocess output:\n{e.output}")
        # stderr is captured separately and usually holds the traceback.
        print(f"Subprocess error (if any):\n{e.stderr}")
        return None, None

    print(f"Subprocess output:\n{result.stdout}")
    print(f"Subprocess error (if any):\n{result.stderr}")
    print("Inference completed. Output written to eval_output.jsonl")

    output_path = 'eval_output.jsonl'
    if not os.path.exists(output_path):
        print("Output folder not found.")
        return None, None

    if os.path.isdir(output_path):
        # Sort so the chosen file does not depend on directory listing order.
        json_files = sorted(
            name for name in os.listdir(output_path) if name.endswith(".jsonl")
        )
        if not json_files:
            print("No output JSONL files found.")
            return None, None
        output_file_path = os.path.join(output_path, json_files[0])
    else:
        # The answers path is a plain file rather than a directory.
        output_file_path = output_path

    with open(output_file_path, "r") as output_file:
        # Skip blank lines (e.g. a trailing newline) instead of failing to parse.
        results = [json.loads(line) for line in output_file if line.strip()]

    return process_inference_results(results, process_image)