Add data from "Documenting Large Webtext Corpora: A Case Study on the Colossal Clean Crawled Corpus"

#6
Files changed (1) hide show
  1. contamination_report.csv +22 -0
contamination_report.csv CHANGED
@@ -1,5 +1,27 @@
1
  Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  UCLNLP/adversarial_qa;adversarialQA;allenai/c4;corpus;;;0.03;data-based;https://arxiv.org/abs/2310.20707;2
5
  UCLNLP/adversarial_qa;adversarialQA;oscar-corpus/OSCAR-2301;corpus;;;0.03;data-based;https://arxiv.org/abs/2310.20707;2
 
1
  Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
2
 
3
+ lama;T-REx;allenai/c4;corpus;;;4.6;data-based;https://arxiv.org/abs/2104.08758;6
4
+ lama;Google-RE;allenai/c4;corpus;;;5.7;data-based;https://arxiv.org/abs/2104.08758;6
5
+ EdinburghNLP/xsum;;allenai/c4;corpus;;;15.49;data-based;https://arxiv.org/abs/2104.08758;6
6
+ reddit_tifu;short;allenai/c4;corpus;;;24.88;data-based;https://arxiv.org/abs/2104.08758;6
7
+ reddit_tifu;long;allenai/c4;corpus;;;1.87;data-based;https://arxiv.org/abs/2104.08758;6
8
+ wiki_bio;;allenai/c4;corpus;;;3.72;data-based;https://arxiv.org/abs/2104.08758;6
9
+ AMR-to-Text;;allenai/c4;corpus;;;10.43;data-based;https://arxiv.org/abs/2104.08758;6
10
+ nyu-mll/glue;BoolQ;allenai/c4;corpus;;;2.4;data-based;https://arxiv.org/abs/2104.08758;6
11
+ nyu-mll/glue;CoLA;allenai/c4;corpus;;;14.4;data-based;https://arxiv.org/abs/2104.08758;6
12
+ nyu-mll/glue;MNLI-hypothesis;allenai/c4;corpus;;;14.2;data-based;https://arxiv.org/abs/2104.08758;6
13
+ nyu-mll/glue;MNLI-premise;allenai/c4;corpus;;;15.2;data-based;https://arxiv.org/abs/2104.08758;6
14
+ nyu-mll/glue;MRPC-sentence-1;allenai/c4;corpus;;;2.7;data-based;https://arxiv.org/abs/2104.08758;6
15
+ nyu-mll/glue;MRPC-sentence-2;allenai/c4;corpus;;;2.7;data-based;https://arxiv.org/abs/2104.08758;6
16
+ nyu-mll/glue;QNLI-sentence;allenai/c4;corpus;;;53.6;data-based;https://arxiv.org/abs/2104.08758;6
17
+ nyu-mll/glue;QNLI-question;allenai/c4;corpus;;;1.8;data-based;https://arxiv.org/abs/2104.08758;6
18
+ nyu-mll/glue;RTE-sentence-1;allenai/c4;corpus;;;6.0;data-based;https://arxiv.org/abs/2104.08758;6
19
+ nyu-mll/glue;RTE-sentence-2;allenai/c4;corpus;;;10.8;data-based;https://arxiv.org/abs/2104.08758;6
20
+ nyu-mll/glue;SST-2;allenai/c4;corpus;;;11.0;data-based;https://arxiv.org/abs/2104.08758;6
21
+ nyu-mll/glue;STS-B-sentence-1;allenai/c4;corpus;;;18.3;data-based;https://arxiv.org/abs/2104.08758;6
22
+ nyu-mll/glue;STS-B-sentence-2;allenai/c4;corpus;;;18.6;data-based;https://arxiv.org/abs/2104.08758;6
23
+ nyu-mll/glue;WNLI-sentence-1;allenai/c4;corpus;;;4.8;data-based;https://arxiv.org/abs/2104.08758;6
24
+ nyu-mll/glue;WNLI-sentence-2;allenai/c4;corpus;;;2.1;data-based;https://arxiv.org/abs/2104.08758;6
25
 
26
  UCLNLP/adversarial_qa;adversarialQA;allenai/c4;corpus;;;0.03;data-based;https://arxiv.org/abs/2310.20707;2
27
  UCLNLP/adversarial_qa;adversarialQA;oscar-corpus/OSCAR-2301;corpus;;;0.03;data-based;https://arxiv.org/abs/2310.20707;2