File size: 5,257 Bytes

74e8f2f

# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""Import widgetcap into TFDS format.

  Widget Captioning also requires images from the RICO dataset:
  mkdir -p /tmp/data/rico_images ; cd /tmp/data/rico_images
  wget https://storage.googleapis.com/crowdstf-rico-uiuc-4540/rico_dataset_v0.1/unique_uis.tar.gz
  tar xvfz unique_uis.tar.gz
  rm unique_uis.tar.gz

  Widget Captioning:
  mkdir -p /tmp/data/widget_captioning ; cd /tmp/data/widget_captioning
  git clone https://github.com/google-research-datasets/widget-caption.git
  cp widget-caption/widget_captions.csv ./
  cp widget-caption/split/*.txt ./
  rm -rf widget-caption

Then, run conversion locally (make sure to install tensorflow-datasets for the
`tfds` util):

  cd big_vision/datasets
  env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=widgetcap

Example to load:

  import tensorflow_datasets as tfds
  dataset_augmented = tfds.load('widgetcap', split='train',
  data_dir='/tmp/tfds')
"""
import csv
import json
import os

import numpy as np
from PIL import Image
import tensorflow_datasets as tfds

_DATASET_DIR = '/tmp/data/widget_captioning'
# Dataset property indicating the y-dim of the canvas
_RICO_CANVAS_Y = 2560
_IMAGE_DIR = '/tmp/data/rico_images/combined'

_CITATION = (
    '@inproceedings{Li2020WidgetCG,title={Widget Captioning: Generating Natural'
    ' Language Description for MobileUser Interface Elements},author={Y. Li and'
    ' Gang Li and Luheng He and Jingjie Zheng and Hong Li and Zhiwei'
    ' Guan},booktitle={Conference on Empirical Methods in Natural Language'
    ' Processing},year={2020},}'
)


class Widgetcap(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for widgetcap dataset.

  Each example is one row of `widget_captions.csv`: a screenshot from
  `_IMAGE_DIR`, the captions for one node of that screen, and the node's
  bounding box looked up in the screen's view-hierarchy JSON.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'Format as needed for PaliGemma'}

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description='The widgetcap dataset.',
        features=tfds.features.FeaturesDict({
            'image/id': tfds.features.Text(),
            'image/filename': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'texts': tfds.features.Sequence(tfds.features.Text()),
            'bbox': tfds.features.BBoxFeature(),
            'screen_id': tfds.features.Text(),
            'node_id': tfds.features.Text(),
            'height': np.int32,
            'width': np.int32,
        }),
        homepage='https://github.com/google-research-datasets/widget-caption',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators.

    Args:
      dl_manager: Unused; all inputs are read from `_DATASET_DIR` and
        `_IMAGE_DIR` on the local filesystem.

    Returns:
      A dict mapping each split name ('train', 'dev', 'test') to its example
      generator.
    """
    return {
        split: self._generate_examples(split)
        for split in ('train', 'dev', 'test')
    }

  def _generate_examples(self, split):
    """Yields (key, example) tuples from the dataset.

    Args:
      split: Split name; `<split>.txt` in `_DATASET_DIR` lists the screen ids
        belonging to the split, one per line.

    Yields:
      One `(image_id, example)` pair per row of `widget_captions.csv` whose
      `screenId` is listed in the split file.
    """
    split_path = os.path.join(_DATASET_DIR, split + '.txt')
    with open(split_path) as f:
      # Skip blank lines so an empty string never ends up as a screen id.
      split_screen_ids = {line.strip() for line in f if line.strip()}

    captions_path = os.path.join(_DATASET_DIR, 'widget_captions.csv')
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(captions_path, newline='') as f:
      for row in csv.DictReader(f):
        if row['screenId'] not in split_screen_ids:
          continue
        yield self._get_example(
            row['screenId'], row['nodeId'], row['captions']
        )

  def _get_node_box(self, screen_id: str, node_id: str, height: int):
    """Looks up a node's bounds and rescales them to the screenshot size.

    Args:
      screen_id: Screen id; `<screen_id>.json` in `_IMAGE_DIR` holds the view
        hierarchy.
      node_id: Dot-separated path to the node. The first component is dropped;
        each remaining component is an index into the `children` list of the
        current node, starting at `view['activity']['root']`.
      height: Height of the screenshot in pixels.

    Returns:
      A list with the node's four `bounds` values, in the order stored in the
      JSON (the caller unpacks them as xmin, ymin, xmax, ymax), each multiplied
      by `height / _RICO_CANVAS_Y`.
    """
    child_indices = [int(i) for i in node_id.split('.')[1:]]
    with open(os.path.join(_IMAGE_DIR, screen_id + '.json')) as f:
      view = json.load(f)

    node = view['activity']['root']
    for index in child_indices:
      node = node['children'][index]

    # One factor is applied to both x and y values.
    # NOTE(review): this assumes the screenshot keeps the canvas aspect ratio
    # -- confirm against the RICO data.
    return [value * height / _RICO_CANVAS_Y for value in node['bounds']]

  def _get_example(self, screen_id: str, node_id: str, captions: str):
    """Builds the `(key, example)` pair for one caption row.

    Args:
      screen_id: Screen id; `<screen_id>.jpg` in `_IMAGE_DIR` is the image.
      node_id: Id of the captioned node within the screen's view hierarchy.
      captions: All captions for the node, separated by '|'.

    Returns:
      A tuple `(image_id, example)` where `image_id` is
      `'<screen_id>_<node_id>'` and `example` matches the features declared in
      `_info`. The bounding box is expressed relative to the image size.
    """
    image_path = os.path.join(_IMAGE_DIR, screen_id + '.jpg')
    # Only the size is needed here; the context manager closes the file handle
    # right away instead of leaving one open per generated example.
    with Image.open(image_path) as image:
      width, height = image.size

    # get bounding box coordinates
    xmin, ymin, xmax, ymax = self._get_node_box(screen_id, node_id, height)

    image_id = f'{screen_id}_{node_id}'
    example = {
        'image/id': image_id,
        'image/filename': screen_id + '.jpg',
        'image': image_path,
        'texts': captions.split('|'),
        'bbox': tfds.features.BBox(
            ymin=ymin / height,
            xmin=xmin / width,
            ymax=ymax / height,
            xmax=xmax / width,
        ),
        'screen_id': screen_id,
        'node_id': node_id,
        'height': height,
        'width': width,
    }
    return image_id, example