|
import os |
|
|
|
|
|
class DatasetCatalog:
    """Central registry of dataset configurations.

    Each attribute is a config dict of the form::

        {"target": "<dotted path to the Dataset class>",
         "params": {<keyword arguments passed to that class>}}

    so a loader can instantiate the dataset by importing ``target`` and
    calling it with ``params``.  Three families are registered:

    * ``*_enc``  — text/X caption pairs used in the X-to-text (encoding) stage.
    * ``*_dec``  — the same pairs with the direction reversed (text-to-X decoding).
    * ``*_instruction`` — instruction-tuning sets (text-to-X and X-to-text).
    """

    @staticmethod
    def _entry(target, **params):
        # One catalog record: import path plus constructor kwargs.
        return {"target": target, "params": params}

    def __init__(self):
        # --- paired caption data (enc = X->text, dec = text->X) ---------
        # Each spec: attribute prefix -> (dataset class path, media subdir,
        # modality name used in the dataset_type string).
        pair_specs = {
            "audiocap": ("dataset.audiocap_dataset.AudioCapDataset", "audios", "Audio"),
            "webvid": ("dataset.webvid_dataset.WebvidDataset", "videos", "Video"),
            "cc3m": ("dataset.cc3m_dataset.CC3MDataset", "images", "Image"),
        }
        for name, (target, mm_dir, modality) in pair_specs.items():
            root = f"../data/T-X_pair_data/{name}"
            common = dict(
                data_path=f"{root}/{name}.json",
                mm_root_path=f"{root}/{mm_dir}",
                embed_path="../data/embed/",
            )
            # enc: X -> text; dec: same data, text -> X.
            setattr(self, f"{name}_enc",
                    self._entry(target, **common, dataset_type=f"{modality}ToText"))
            setattr(self, f"{name}_dec",
                    self._entry(target, **common, dataset_type=f"TextTo{modality}"))

        # --- text -> X instruction data (T-T+X) --------------------------
        # NOTE(review): these use embed_path "./embed/" while the pair data
        # above uses "../data/embed/" — looks inconsistent; confirm which
        # directory the precomputed embeddings actually live in.
        for name, modality in (("audio", "Audio"), ("video", "Video"), ("image", "Image")):
            setattr(self, f"{name}_instruction", self._entry(
                "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
                data_path=f"../data/IT_data/T-T+X_data/{name}_t2x.json",
                embed_path="./embed/",
                dataset_type=f"TextTo{modality}",
            ))

        # --- X -> text instruction data (T+X-T) --------------------------
        # These three differ in their parameter sets (only llava has media
        # files on disk), so they are spelled out individually.
        self.llava_instruction = self._entry(
            "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            data_path="../data/IT_data/T+X-T_data/llava/llava.json",
            mm_root_path="../data/IT_data/T+X-T_data/llava/images",
            dataset_type="ImageToText",
        )

        self.alpaca_instruction = self._entry(
            "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            data_path="../data/IT_data/T+X-T_data/alpaca/alpaca.json",
            dataset_type="TextToText",
        )

        self.videochat_instruction = self._entry(
            "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            data_path="../data/IT_data/T+X-T_data/videochat/videochat.json",
            dataset_type="VideoToText",
        )
|
|