File size: 4,511 Bytes
19d93fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1045c52
 
 
 
 
 
 
 
 
 
 
 
19d93fe
ebac224
 
 
 
4ad8722
 
 
ebac224
 
4ad8722
1045c52
19d93fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ad8722
19d93fe
1045c52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ad8722
ebac224
 
4ad8722
 
 
ebac224
 
4ad8722
ebac224
 
4ad8722
 
 
ebac224
 
4ad8722
ebac224
 
4ad8722
 
 
ebac224
 
4ad8722
ebac224
 
1045c52
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Dataset names of the first ViDoRe set. Each entry is matched by exact
# equality in get_datasets_nickname().
VIDORE_V1_MTEB_NAMES = [
    "VidoreArxivQARetrieval",
    "VidoreDocVQARetrieval",
    "VidoreInfoVQARetrieval",
    "VidoreShiftProjectRetrieval",
    "VidoreSyntheticDocQAAIRetrieval",
    "VidoreSyntheticDocQAEnergyRetrieval",
    "VidoreSyntheticDocQAGovernmentReportsRetrieval",
    "VidoreSyntheticDocQAHealthcareIndustryRetrieval",
    "VidoreTabfquadRetrieval",
    "VidoreTatdqaRetrieval",
]
# Dataset names of the second ViDoRe set. Each entry is likewise matched by
# exact equality in get_datasets_nickname().
VIDORE_V2_MTEB_NAMES = [
    "Vidore2BioMedicalLecturesRetrieval",
    "Vidore2EconomicsReportsRetrieval",
    "Vidore2ESGReportsHLRetrieval",
    "Vidore2ESGReportsRetrieval",
]
# Substrings that deprecated_get_datasets_nickname() looks for (with `in`)
# to recognise first-set datasets; one keyword per nickname.
DEPRECATED_VIDORE_DATASETS_KEYWORDS = [
    "arxivqa",
    "docvqa",
    "infovqa",
    "tabfquad",
    "tatdqa",
    "shift",
    "artificial_intelligence",
    "energy",
    "government_reports",
    "healthcare_industry",
]

# Substrings that deprecated_get_datasets_nickname() looks for to recognise
# second-set datasets. That function additionally checks the qualifiers
# "multilingual", "_eng_" and "esg_reports_human", which are not listed here
# ("esg_reports_human" is already covered by the "esg_reports" substring).
DEPRECATED_VIDORE_2_DATASETS_KEYWORDS = [
    "restaurant_esg",
    "rse_restaurant",
    "mit_biomedical",
    "economics_macro",
    "biomedical_lectures",
    "esg_reports",
    "economics_reports",
]


# Exact dataset name -> display nickname (first-set entries, then second-set).
_NICKNAME_BY_DATASET_NAME = {
    "VidoreArxivQARetrieval": "ArxivQA",
    "VidoreDocVQARetrieval": "DocVQA",
    "VidoreInfoVQARetrieval": "InfoVQA",
    "VidoreTabfquadRetrieval": "TabFQuad",
    "VidoreTatdqaRetrieval": "TAT-DQA",
    "VidoreShiftProjectRetrieval": "Shift Project",
    "VidoreSyntheticDocQAAIRetrieval": "Artificial Intelligence",
    "VidoreSyntheticDocQAEnergyRetrieval": "Energy",
    "VidoreSyntheticDocQAGovernmentReportsRetrieval": "Government Reports",
    "VidoreSyntheticDocQAHealthcareIndustryRetrieval": "Healthcare Industry",
    "Vidore2ESGReportsHLRetrieval": "ESG Restaurant Human English",
    "Vidore2ESGReportsRetrieval": "ESG Restaurant Synthetic Multilingual",
    "Vidore2BioMedicalLecturesRetrieval": "MIT Biomedical Multilingual",
    "Vidore2EconomicsReportsRetrieval": "Economics Macro Multilingual",
}


def get_datasets_nickname(dataset_name) -> str:
    """Return the display nickname for an exact ViDoRe dataset name.

    Args:
        dataset_name: Name to look up. It must equal one of the known
            dataset names exactly; no substring matching is performed.

    Returns:
        The nickname associated with ``dataset_name``.

    Raises:
        ValueError: If ``dataset_name`` matches none of the known names.
    """
    try:
        nickname = _NICKNAME_BY_DATASET_NAME.get(dataset_name)
    except TypeError:
        # Unhashable values cannot be dictionary keys, hence cannot match
        # any known name; report them like every other unknown name.
        nickname = None

    if nickname is None:
        raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")
    return nickname


def deprecated_get_datasets_nickname(dataset_name) -> str:
    """Derive a display nickname from keywords contained in a dataset name.

    Rules are tried in a fixed order and the first one that matches wins,
    so a name containing several keywords resolves to the earliest rule.
    Matching is case-sensitive substring containment.

    Args:
        dataset_name: Dataset name to classify.

    Returns:
        The nickname of the first matching rule.

    Raises:
        ValueError: If no rule matches ``dataset_name``.
    """

    def mentions(keyword):
        return keyword in dataset_name

    # Rules decided by a single keyword.
    single_keyword_rules = (
        ("arxivqa", "ArxivQA"),
        ("docvqa", "DocVQA"),
        ("infovqa", "InfoVQA"),
        ("tabfquad", "TabFQuad"),
        ("tatdqa", "TAT-DQA"),
        ("shift", "Shift Project"),
        ("artificial_intelligence", "Artificial Intelligence"),
        ("energy", "Energy"),
        ("government_reports", "Government Reports"),
        ("healthcare_industry", "Healthcare Industry"),
    )
    for keyword, nickname in single_keyword_rules:
        if mentions(keyword):
            return nickname

    # Must run before the "esg_reports" family below, which would otherwise
    # also match names containing "esg_reports_human".
    if mentions("restaurant_esg") or mentions("esg_reports_human"):
        return "ESG Restaurant Human"

    # Each family is recognised by two alternative keywords. With the first
    # keyword, the multilingual variant is flagged by "multilingual" being
    # present; with the second, by "_eng_" being absent.
    family_rules = (
        (
            "rse_restaurant",
            "esg_reports",
            "ESG Restaurant Synthetic Multilingual",
            "ESG Restaurant Synthetic",
        ),
        (
            "mit_biomedical",
            "biomedical_lectures",
            "MIT Biomedical Multilingual",
            "MIT Biomedical",
        ),
        (
            "economics_macro",
            "economics_reports",
            "Economics Macro Multilingual",
            "Economics Macro",
        ),
    )
    for tagged_kw, suffixed_kw, multilingual_nickname, plain_nickname in family_rules:
        if (mentions(tagged_kw) and mentions("multilingual")) or (
            mentions(suffixed_kw) and not mentions("_eng_")
        ):
            return multilingual_nickname
        if mentions(tagged_kw) or (mentions(suffixed_kw) and mentions("_eng_")):
            return plain_nickname

    raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")