import os
class DatasetCatalog:
def __init__(self):
        # The following datasets are used for encoding-side alignment learning.
self.audiocap_enc = {
"target": "dataset.audiocap_dataset.AudioCapDataset",
"params": dict(
data_path="../data/T-X_pair_data/audiocap/audiocap.json",
mm_root_path="../data/T-X_pair_data/audiocap/audios",
embed_path="../data/embed/",
dataset_type="AudioToText",
),
}
self.webvid_enc = {
"target": "dataset.webvid_dataset.WebvidDataset",
"params": dict(
data_path="../data/T-X_pair_data/webvid/webvid.json",
mm_root_path="../data/T-X_pair_data/webvid/videos",
embed_path="../data/embed/",
dataset_type="VideoToText",
),
}
self.cc3m_enc = {
"target": "dataset.cc3m_dataset.CC3MDataset",
"params": dict(
data_path="../data/T-X_pair_data/cc3m/cc3m.json",
mm_root_path="../data/T-X_pair_data/cc3m/images",
embed_path="../data/embed/",
dataset_type="ImageToText",
),
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
        # The following datasets are used for decoding-side alignment learning.
self.audiocap_dec = {
"target": "dataset.audiocap_dataset.AudioCapDataset",
"params": dict(
data_path="../data/T-X_pair_data/audiocap/audiocap.json",
mm_root_path="../data/T-X_pair_data/audiocap/audios",
embed_path="../data/embed/",
dataset_type="TextToAudio",
),
}
self.webvid_dec = {
"target": "dataset.webvid_dataset.WebvidDataset",
"params": dict(
data_path="../data/T-X_pair_data/webvid/webvid.json",
mm_root_path="../data/T-X_pair_data/webvid/videos",
embed_path="../data/embed/",
dataset_type="TextToVideo",
),
}
self.cc3m_dec = {
"target": "dataset.cc3m_dataset.CC3MDataset",
"params": dict(
data_path="../data/T-X_pair_data/cc3m/cc3m.json",
mm_root_path="../data/T-X_pair_data/cc3m/images",
embed_path="../data/embed/",
dataset_type="TextToImage",
),
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
        # The following datasets are used for instruction tuning, i.e., they are instruction datasets.
self.audio_instruction = {
"target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
"params": dict(
data_path="../data/IT_data/T-T+X_data/audio_t2x.json",
embed_path="./embed/",
dataset_type="TextToAudio",
),
}
self.video_instruction = {
"target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
"params": dict(
data_path="../data/IT_data/T-T+X_data/video_t2x.json",
embed_path="./embed/",
dataset_type="TextToVideo",
),
}
self.image_instruction = {
"target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
"params": dict(
data_path="../data/IT_data/T-T+X_data/image_t2x.json",
embed_path="./embed/",
dataset_type="TextToImage",
),
}
self.llava_instruction = {
"target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
"params": dict(
data_path="../data/IT_data/T+X-T_data/llava/llava.json",
mm_root_path="../data/IT_data/T+X-T_data/llava/images",
dataset_type="ImageToText",
),
}
self.alpaca_instruction = {
"target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
"params": dict(
data_path="../data/IT_data/T+X-T_data/alpaca/alpaca.json",
dataset_type="TextToText",
),
}
self.videochat_instruction = {
"target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
"params": dict(
data_path="../data/IT_data/T+X-T_data/videochat/videochat.json",
dataset_type="VideoToText",
),
}
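
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Usage sketch (illustrative; not part of the original catalog). Each entry
# above pairs a dotted "target" class path with constructor "params", the
# usual pattern for dynamic instantiation via importlib. `load_dataset` below
# is a hypothetical helper: it assumes each target module is importable and
# that the referenced class accepts the given keyword arguments.
if __name__ == "__main__":
    import importlib

    def load_dataset(entry: dict):
        """Instantiate a dataset from {"target": "pkg.mod.Cls", "params": {...}}."""
        module_path, cls_name = entry["target"].rsplit(".", 1)
        cls = getattr(importlib.import_module(module_path), cls_name)
        return cls(**entry["params"])

    catalog = DatasetCatalog()
    # e.g. build the CC3M encoding-side dataset (requires the data on disk):
    # cc3m = load_dataset(catalog.cc3m_enc)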