PeteBleackley committed on
Commit
985ef96
·
1 Parent(s): f16a715

CorpusRepeater

Browse files
qarac/corpora/CorpusRepeater.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Thu Sep 7 14:20:13 2023
5
+
6
+ @author: peter
7
+ """
8
+
9
class CorpusRepeater:
    """
    Iterable which repeats a corpus until a required number of samples
    has been produced.

    Each iteration makes ``required_length // len(corpus)`` complete passes
    over the corpus, followed by the first ``required_length % len(corpus)``
    samples of one further pass.
    """

    def __init__(self, corpus, required_length):
        """
        Creates a generator which repeats the corpus to the required length

        Parameters
        ----------
        corpus : iterable with __len__ defined
            Corpus to be repeated. It is iterated afresh on every pass, so
            it must support being iterated more than once.
        required_length : int
            number of samples required per epoch. Must not be negative.

        Raises
        ------
        ValueError
            If required_length is negative, or if the corpus is empty while
            required_length is greater than zero.

        Returns
        -------
        None.

        """
        # Python's % always returns a non-negative remainder, so a negative
        # length would otherwise silently yield samples (e.g. -1 % 3 == 2).
        if required_length < 0:
            raise ValueError(
                f"required_length must be non-negative, got {required_length}"
            )
        self.corpus = corpus
        n = len(self.corpus)
        if n == 0:
            # An empty corpus can only satisfy a request for zero samples;
            # dividing by its length would raise an opaque ZeroDivisionError.
            if required_length > 0:
                raise ValueError(
                    "cannot repeat an empty corpus to a non-zero length"
                )
            self.repeats = 0
            self.remainder = 0
        else:
            # Number of complete passes, then size of the final partial pass.
            self.repeats, self.remainder = divmod(required_length, n)

    def __iter__(self):
        """
        Iterable over samples from the corpus, repeated sufficient times to
        make up the required length

        Yields
        ------
        sample : Any
            samples from the underlying corpus

        """
        # Complete passes over the corpus.
        for _ in range(self.repeats):
            yield from self.corpus
        # Partial pass: range comes first in the zip so that it is exhausted
        # before an extra sample is pulled from the corpus.
        for _, sample in zip(range(self.remainder), self.corpus):
            yield sample
50
+
51
+
requirements.txt CHANGED
@@ -5,3 +5,6 @@ numpy
5
  nltk
6
  tokenizers
7
  transformers
 
 
 
 
5
  nltk
6
  tokenizers
7
  transformers
8
+ spacy
9
+ spacy-experimental
10
+ pandas