|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r"""Import widgetcap into TFDS format.

Widget Captioning requires images from the RICO dataset:

  mkdir -p /tmp/data/rico_images ; cd /tmp/data/rico_images
  wget https://storage.googleapis.com/crowdstf-rico-uiuc-4540/rico_dataset_v0.1/unique_uis.tar.gz
  tar xvfz unique_uis.tar.gz
  rm unique_uis.tar.gz

Widget Captioning:

  mkdir -p /tmp/data/widget_captioning ; cd /tmp/data/widget_captioning
  git clone https://github.com/google-research-datasets/widget-caption.git
  cp widget-caption/widget_captions.csv ./
  cp widget-caption/split/*.txt ./
  rm -rf widget-caption

Then, run conversion locally (make sure to install tensorflow-datasets for the
`tfds` util):

  cd big_vision/datasets
  env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=widgetcap

Example to load:

  import tensorflow_datasets as tfds
  dataset_augmented = tfds.load('widgetcap', split='train',
                                data_dir='/tmp/tfds')
"""
|
import csv |
|
import json |
|
import os |
|
|
|
import numpy as np |
|
from PIL import Image |
|
import tensorflow_datasets as tfds |
|
|
|
# Directory holding `widget_captions.csv` and the `<split>.txt` files that list
# the screen ids of each split.
_DATASET_DIR = '/tmp/data/widget_captioning'

# Reference height for the view-hierarchy `bounds`: a bound is rescaled to the
# screenshot as `bound * image_height / _RICO_CANVAS_Y`.
_RICO_CANVAS_Y = 2560
# Directory holding the RICO `<screen_id>.jpg` screenshots and the matching
# `<screen_id>.json` view hierarchies.
_IMAGE_DIR = '/tmp/data/rico_images/combined'

# BibTeX entry attached to the dataset metadata.
_CITATION = (
    '@inproceedings{Li2020WidgetCG,title={Widget Captioning: Generating Natural'
    ' Language Description for Mobile User Interface Elements},author={Y. Li'
    ' and Gang Li and Luheng He and Jingjie Zheng and Hong Li and Zhiwei'
    ' Guan},booktitle={Conference on Empirical Methods in Natural Language'
    ' Processing},year={2020},}'
)
|
|
|
|
|
class Widgetcap(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for widgetcap dataset.

  Each example pairs one screenshot with a single UI node of that screen: the
  node's bounding box (relative to the image size) and the captions listed for
  it in `widget_captions.csv`. All inputs are read from the local directories
  `_DATASET_DIR` and `_IMAGE_DIR`.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'Format as needed for PaliGemma'}

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description='The widgetcap dataset.',
        features=tfds.features.FeaturesDict({
            'image/id': tfds.features.Text(),
            'image/filename': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'texts': tfds.features.Sequence(tfds.features.Text()),
            'bbox': tfds.features.BBoxFeature(),
            'screen_id': tfds.features.Text(),
            'node_id': tfds.features.Text(),
            'height': np.int32,
            'width': np.int32,
        }),
        homepage='https://github.com/google-research-datasets/widget-caption',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators.

    Args:
      dl_manager: Not used; every input is read from a local directory.

    Returns:
      A dict mapping each split name ('train', 'dev', 'test') to its example
      generator.
    """
    return {
        'train': self._generate_examples('train'),
        'dev': self._generate_examples('dev'),
        'test': self._generate_examples('test'),
    }

  def _generate_examples(self, split):
    """Yields (key, example) tuples from the dataset.

    Args:
      split: Split name. `<split>.txt` in `_DATASET_DIR` lists the screen ids
        that belong to the split, one per line.

    Yields:
      `(image_id, example)` for every row of `widget_captions.csv` whose
      `screenId` is listed in the split file.
    """
    split_path = os.path.join(_DATASET_DIR, split + '.txt')
    with open(split_path, encoding='utf-8') as f:
      # Skip blank lines so that '' never ends up in the id set.
      split_screen_ids = {line.strip() for line in f if line.strip()}

    captions_path = os.path.join(_DATASET_DIR, 'widget_captions.csv')
    # The csv module expects files opened with newline='' so that it can handle
    # newlines inside quoted fields itself.
    with open(captions_path, newline='', encoding='utf-8') as f:
      for row in csv.DictReader(f):
        if row['screenId'] in split_screen_ids:
          yield self._get_example(
              row['screenId'], row['nodeId'], row['captions']
          )

  def _get_node_box(self, screen_id, node_id, height):
    """Returns a node's bounds rescaled to the screenshot height.

    Args:
      screen_id: Screen id; `<screen_id>.json` in `_IMAGE_DIR` holds the view
        hierarchy of the screen.
      node_id: Dot-separated node path. The first component is skipped; every
        following component is an index into the current node's `children`.
      height: Height of the screenshot in pixels.

    Returns:
      A tuple with the node's `bounds` values, each multiplied by
      `height / _RICO_CANVAS_Y`.
    """
    child_indices = [int(i) for i in node_id.split('.')[1:]]
    hierarchy_path = os.path.join(_IMAGE_DIR, screen_id + '.json')
    with open(hierarchy_path, encoding='utf-8') as f:
      view = json.load(f)

    # Walk down from the root, one `children` index per path component.
    node = view['activity']['root']
    for index in child_indices:
      node = node['children'][index]

    # A tuple (rather than a lazy `map`) can be unpacked or indexed repeatedly.
    return tuple(x * height / _RICO_CANVAS_Y for x in node['bounds'])

  def _get_example(self, screen_id, node_id, captions):
    """Builds the example for one (screen, node) row.

    Args:
      screen_id: Screen id; `<screen_id>.jpg` in `_IMAGE_DIR` is the image.
      node_id: Dot-separated node path, see `_get_node_box`.
      captions: The node's captions joined with '|'.

    Returns:
      `(image_id, example)` where `image_id` is `<screen_id>_<node_id>` and
      `example` is the feature dict described in `_info`.
    """
    image_filename = screen_id + '.jpg'
    image_path = os.path.join(_IMAGE_DIR, image_filename)
    # Only the size is needed here, so release the file handle right away; the
    # image itself is passed on to TFDS by path.
    with Image.open(image_path) as image:
      width, height = image.size

    xmin, ymin, xmax, ymax = self._get_node_box(screen_id, node_id, height)

    image_id = f'{screen_id}_{node_id}'
    example = {
        'image/id': image_id,
        'image/filename': image_filename,
        'image': image_path,
        'texts': captions.split('|'),
        # Pixel coordinates divided by the image size give relative coordinates.
        'bbox': tfds.features.BBox(
            ymin=ymin / height,
            xmin=xmin / width,
            ymax=ymax / height,
            xmax=xmax / width,
        ),
        'screen_id': screen_id,
        'node_id': node_id,
        'height': height,
        'width': width,
    }
    return image_id, example
|
|