Spaces:
Build error
Build error
PeteBleackley
committed on
Commit
·
985ef96
1
Parent(s):
f16a715
CorpusRepeater
Browse files
- qarac/corpora/CorpusRepeater.py +51 -0
- requirements.txt +3 -0
qarac/corpora/CorpusRepeater.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
Created on Thu Sep 7 14:20:13 2023
|
5 |
+
|
6 |
+
@author: peter
|
7 |
+
"""
|
8 |
+
|
9 |
+
class CorpusRepeater(object):
    """
    Iterable that repeats a corpus until a required number of samples is
    produced.

    The corpus is iterated in full ``required_length // len(corpus)`` times,
    followed by the first ``required_length % len(corpus)`` samples of one
    further pass, so each iteration over the repeater yields exactly
    ``required_length`` samples.
    """

    def __init__(self, corpus, required_length):
        """
        Creates an iterable which repeats the corpus to the required length

        Parameters
        ----------
        corpus : iterable with __len__ defined
            Corpus to be repeated
        required_length : int
            number of samples required per epoch

        Raises
        ------
        ValueError
            If required_length is negative, or if the corpus is empty while
            required_length is greater than zero.

        """
        # Floor division on a negative length would give a positive
        # remainder and silently yield samples, so reject it explicitly.
        if required_length < 0:
            raise ValueError(
                "required_length must be non-negative, got "
                "{}".format(required_length)
            )
        self.corpus = corpus
        n = len(self.corpus)
        if n == 0:
            # An empty corpus can never be stretched to a positive length;
            # fail with a clear message rather than a ZeroDivisionError.
            if required_length > 0:
                raise ValueError(
                    "cannot repeat an empty corpus to length "
                    "{}".format(required_length)
                )
            self.repeats, self.remainder = 0, 0
        else:
            # Whole passes over the corpus, plus the size of the final
            # partial pass.
            self.repeats, self.remainder = divmod(required_length, n)

    def __iter__(self):
        """
        Iterable over samples from the corpus, repeated sufficient times to
        make up the required length

        Yields
        ------
        sample : Any
            samples from the underlying corpus

        """
        for _ in range(self.repeats):
            yield from self.corpus
        # Only start a further pass over the corpus when a partial pass is
        # actually needed.
        if self.remainder:
            # range() comes first in zip() so that iteration stops as soon
            # as the count is reached, without pulling an extra sample from
            # the corpus.
            for _, sample in zip(range(self.remainder), self.corpus):
                yield sample
|
50 |
+
|
51 |
+
|
requirements.txt
CHANGED
@@ -5,3 +5,6 @@ numpy
|
|
5 |
nltk
|
6 |
tokenizers
|
7 |
transformers
|
|
|
|
|
|
|
|
5 |
nltk
|
6 |
tokenizers
|
7 |
transformers
|
8 |
+
spacy
|
9 |
+
spacy-experimental
|
10 |
+
pandas
|