# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=line-too-long
r"""Import widgetcap into TFDS format.
Widget Captioning all requires images from the RICO dataset:
mkdir -p /tmp/data/rico_images ; cd /tmp/data/rico_images
wget
https://storage.googleapis.com/crowdstf-rico-uiuc-4540/rico_dataset_v0.1/unique_uis.tar.gz
tar xvfz unique_uis.tar.gz
rm unique_uis.tar.gz
Widget Captioning:
mkdir - /tmp/data/widget_captioning ; cd /tmp/data/widget_captioning
git clone https://github.com/google-research-datasets/widget-caption.git
cp widget-caption/widget_captions.csv ./
cp widget-caption/split/*.txt ./
rm -rf widget-caption
Then, run conversion locally (make sure to install tensorflow-datasets for the
`tfds` util):
cd big_vision/datasets
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=widgetcap
Example to load:
import tensorflow_datasets as tfds
dataset_augmented = tfds.load('widgetcap', split='train',
data_dir='/tmp/tfds')
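
A quick sanity check on one record (field names match the FeaturesDict
defined below):

  example = next(iter(tfds.as_numpy(dataset)))
  print(example['image/id'], example['texts'])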
"""
import csv
import json
import os

import numpy as np
from PIL import Image
import tensorflow_datasets as tfds

_DATASET_DIR = '/tmp/data/widget_captioning'
# Fixed y-dimension of the RICO canvas; view-hierarchy bounds are scaled by
# height / _RICO_CANVAS_Y to map them onto the screenshot.
_RICO_CANVAS_Y = 2560

_IMAGE_DIR = '/tmp/data/rico_images/combined'

_CITATION = (
'@inproceedings{Li2020WidgetCG,title={Widget Captioning: Generating Natural'
    ' Language Description for Mobile User Interface Elements},author={Y. Li and'
' Gang Li and Luheng He and Jingjie Zheng and Hong Li and Zhiwei'
' Guan},booktitle={Conference on Empirical Methods in Natural Language'
' Processing},year={2020},}'
)


class Widgetcap(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for widgetcap dataset."""

VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'Format as needed for PaliGemma'}

  def _info(self) -> tfds.core.DatasetInfo:
"""Returns the metadata."""
return tfds.core.DatasetInfo(
builder=self,
description='The widgetcap dataset.',
features=tfds.features.FeaturesDict({
'image/id': tfds.features.Text(),
'image/filename': tfds.features.Text(),
'image': tfds.features.Image(encoding_format='jpeg'),
            # All captions for this widget (pipe-separated in the source CSV).
            'texts': tfds.features.Sequence(tfds.features.Text()),
            # Widget bounds normalized to [0, 1], as (ymin, xmin, ymax, xmax).
            'bbox': tfds.features.BBoxFeature(),
'screen_id': tfds.features.Text(),
'node_id': tfds.features.Text(),
'height': np.int32,
'width': np.int32,
}),
homepage='https://github.com/google-research-datasets/widget-caption',
citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
return {
'train': self._generate_examples('train'),
'dev': self._generate_examples('dev'),
'test': self._generate_examples('test'),
    }

  def _generate_examples(self, split):
    """Yields (key, example) pairs for the given split."""
    # Split membership is defined by the screen ids listed in '<split>.txt'.
    split_screen_ids = set()
    with open(os.path.join(_DATASET_DIR, split + '.txt')) as f:
for line in f:
split_screen_ids.add(line.strip())
with open(os.path.join(_DATASET_DIR, 'widget_captions.csv')) as f:
for row in csv.DictReader(f):
if row['screenId'] in split_screen_ids:
id_, example = self._get_example(
row['screenId'], row['nodeId'], row['captions']
)
yield id_, example

  def _get_node_box(self, screen_id, node_id, height):
    """Returns the widget's bounds scaled to the screenshot's pixel space."""
    # A node_id like '0.3.1' (an illustrative value) walks the view hierarchy:
    # the first component refers to the root and is skipped; the rest index
    # into successive 'children' lists.
    index_list = [int(i) for i in node_id.split('.')[1:]]
    with open(os.path.join(_IMAGE_DIR, screen_id + '.json')) as f:
      view = json.load(f)
    curr_node = view['activity']['root']
    for index in index_list:
      curr_node = curr_node['children'][index]
    # RICO bounds are (left, top, right, bottom) on a fixed 2560-px-tall
    # canvas; scale them uniformly by the screenshot's actual height.
    return [x * height / _RICO_CANVAS_Y for x in curr_node['bounds']]
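  # Worked example with made-up numbers: for a 1280-px-tall screenshot,
  # RICO bounds [72, 84, 1368, 252] are scaled by 1280 / 2560 = 0.5,
  # yielding [36.0, 42.0, 684.0, 126.0] in screenshot pixels.
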
  def _get_example(self, screen_id, node_id, captions):
    """Builds one (key, example) pair for a captioned widget."""
    image_path = os.path.join(_IMAGE_DIR, screen_id + '.jpg')
    with Image.open(image_path) as image:  # Only the size is read here.
      width, height = image.size
    # Widget bounds in screenshot pixel coordinates.
    xmin, ymin, xmax, ymax = self._get_node_box(screen_id, node_id, height)
    image_id = f'{screen_id}_{node_id}'
    example = {
        'image/id': image_id,
        'image/filename': screen_id + '.jpg',
        'image': image_path,
        # Captions are '|'-separated in widget_captions.csv.
        'texts': captions.split('|'),
'bbox': tfds.features.BBox(
ymin=ymin / height,
xmin=xmin / width,
ymax=ymax / height,
xmax=xmax / width,
),
'screen_id': screen_id,
'node_id': node_id,
'height': height,
'width': width,
}
return image_id, example
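

# The dataset can also be prepared programmatically instead of via the
# `tfds build` CLI (a sketch, assuming the raw data already sits at the
# paths above):
#
#   builder = Widgetcap(data_dir='/tmp/tfds')
#   builder.download_and_prepare()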