# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""Generates GQA in a TFDS-ready structure, using Beam.

Instructions below are to generate the dataset with a *local* Beam pipeline.
Running the Beam job on Google Cloud Dataflow instead is advisable, as it
would significantly speed up generation; see
  https://www.tensorflow.org/datasets/beam_datasets
for more details. This would involve uploading the locally downloaded data to
a GCS bucket, and then adding the Beam pipeline options and your GCP/GCS
bucket details to the `tfds build` command below (as detailed in the link).

First, copy the data to local disk:

  mkdir -p /tmp/data/gqa
  wget -O /tmp/data/gqa/question1.2.zip https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip?download=true
  unzip /tmp/data/gqa/question1.2.zip
  mv /tmp/data/gqa/question1.2/* /tmp/data/gqa/
  wget -O /tmp/data/gqa/images.zip https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip?download=true
  unzip /tmp/data/gqa/images.zip

Then, run conversion (make sure to install tensorflow-datasets for the `tfds` util):

  cd big_vision/datasets
  env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=gqa

Example to load:

  import tensorflow_datasets as tfds
  dataset = tfds.load('gqa', split='testdev_balanced', data_dir='/tmp/tfds')

Some statistics:
  train_all: 14305356 examples
  train_balanced: 943000 examples
  val_all: 2011853 examples
  val_balanced: 132062 examples
  testdev_all: 172174 examples
  testdev_balanced: 12578 examples
"""
import glob
import json
import os

import numpy as np
import tensorflow_datasets as tfds


# One-line description passed as `description` to the builder's DatasetInfo.
_DESCRIPTION = """GQA: Visual Reasoning in the Real World."""

# BibTeX entry passed as `citation` to the builder's DatasetInfo.
# NOTE(review): the entry key below (abs-2306-14610) does not match the
# biburl/eprint fields (1902.09506) -- looks like a copy-paste leftover; confirm
# before changing, since this string is stored in the dataset metadata.
# pylint: disable=line-too-long
_CITATION = """
@article{DBLP:journals/corr/abs-2306-14610,
  author       = {Drew Hudson and
                  Christopher Manning},
  title        = {GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
  journal      = {CVPR},
  volume       = {abs/1902.09506},
  year         = {2019},
  url          = {https://doi.org/10.48550/arXiv.1902.09506},
  doi          = {10.48550/arXiv.1902.09506},
  eprinttype    = {arXiv},
  eprint       = {1902.09506},
  timestamp    = {Tue, 25 Jun 2019 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1902-09506},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
"""
# pylint: enable=line-too-long


# Root directory of the locally downloaded GQA data. The builder reads the
# `<split>_questions.json` files, the `train_all_questions/` directory and the
# `images/` directory from here.
_DATA_PATH = '/tmp/data/gqa/'


class GQA(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for GQA dataset.

  Question annotations (JSON) and images are read from `_DATA_PATH`; one
  example is produced per question through a Beam pipeline.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the metadata (feature spec, homepage and citation)."""

    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'example_id': tfds.features.Scalar(np.int64),
            'image/id': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'question': tfds.features.Text(),
            'answer': tfds.features.Text(),
            'full_answer': tfds.features.Text(),
            'is_balanced': tfds.features.Scalar(np.bool_),
        }),
        homepage='https://cs.stanford.edu/people/dorarad/gqa/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators.

    Args:
      dl_manager: Unused here; all inputs are read directly from `_DATA_PATH`.

    Returns:
      A dict mapping each split name to the Beam transform that generates its
      examples.
    """
    splits = [
        # 'debug',
        'train_all',
        'train_balanced',
        'testdev_all',
        'testdev_balanced',
        'val_all',
        'val_balanced',
        'challenge_all',
        'challenge_balanced',
    ]
    return {split: self._generate_examples(split) for split in splits}

  def _generate_examples(self, split: str):
    """Builds the Beam transform yielding (key, example) tuples for a split.

    Args:
      split: Name of the split; selects which question file(s) under
        `_DATA_PATH` are read.

    Returns:
      A Beam transform that loads the question JSON file(s), and maps every
      question entry to a `(question_id, example_dict)` pair.
    """
    if split == 'train_all':
      # 'train_all' questions live in several JSON files inside a directory,
      # whereas every other split is read from a single JSON file.
      pattern = os.path.join(_DATA_PATH, 'train_all_questions', '*.json')
      # glob returns files in arbitrary order; sort so that the pipeline input
      # does not depend on the filesystem listing order.
      json_files = sorted(glob.glob(pattern))
    else:
      json_files = [os.path.join(_DATA_PATH, f'{split}_questions.json')]

    def _prepare_data(json_path):
      """Loads one question file; returns its (question_id, data) pairs."""
      # JSON text is UTF-8; be explicit instead of relying on the locale's
      # default encoding.
      with open(json_path, encoding='utf-8') as f:
        annotations = json.load(f)
      return list(annotations.items())

    def _process_example(entry):
      """Converts one (question_id, data) pair into a (key, example) tuple."""
      question_id, question_data = entry
      image_id = question_data['imageId']
      example = {
          # NOTE(review): question_id is a JSON object key (a string) while the
          # feature is declared as int64; presumably the feature encoding
          # converts it -- confirm.
          'example_id': question_id,
          'image/id': image_id,
          'image': os.path.join(_DATA_PATH, 'images', f'{image_id}.jpg'),
          'question': question_data['question'],
          # Answers may be missing from an entry; fall back to empty strings.
          'answer': question_data.get('answer', ''),
          'full_answer': question_data.get('fullAnswer', ''),
          'is_balanced': question_data['isBalanced'],
      }
      return question_id, example

    beam = tfds.core.lazy_imports.apache_beam
    return (
        beam.Create(json_files)
        | beam.FlatMap(_prepare_data)
        # Presumably here so that per-question processing is redistributed
        # instead of staying tied to the (few) input files -- confirm.
        | beam.Reshuffle()
        | beam.Map(_process_example)
    )