jerome-white committed on
Commit
06d7446
1 Parent(s): a9490de

Citations without start and end indices

Browse files

It is not clear what start and end indices are. This commit updates
the citation logic to support removing those indices, and reporting
citations in a more traditional way. The logic is handled in a
collection of parser classes, which are encapsulated in the citation
manager.

Files changed (1) hide show
  1. mylib/_citations.py +78 -14
mylib/_citations.py CHANGED
@@ -1,20 +1,84 @@
1
- class CitationManager:
2
- def __init__(self, annotations, client, start=1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  self.start = start
4
- self.body = {}
5
- self.citations = []
6
 
 
 
 
 
 
 
7
  for a in annotations:
8
- reference = f'[{start}]'
9
- self.body[a.text] = reference
10
- document = client.files.retrieve(a.file_citation.file_id)
11
- self.citations.append('{} {}:{}--{}'.format(
12
- reference,
13
- document.filename,
14
- a.start_index,
15
- a.end_index,
16
- ))
17
- start += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def __len__(self):
20
  return len(self.citations)
 
1
+ import collections as cl
2
+ from dataclasses import dataclass
3
+
4
+ #
5
+ #
6
+ #
7
@dataclass
class Citation:
    """One parsed citation: the annotated text, its marker, and its citation line.

    Instances are produced by ``CitationParser.__call__``, which fills
    ``text`` from the annotation and ``refn``/``cite`` from the pair returned
    by a parser's ``extract`` method.
    """

    text: str  # the annotation's ``text`` attribute
    refn: str  # reference marker, e.g. '[1]'
    cite: str  # citation line; begins with the reference marker
12
+
13
def unique(values):
    """Lazily yield each distinct item of *values* in first-seen order.

    Items are tracked in a set, so they must be hashable. Because this is a
    generator, nothing is consumed from *values* until iteration starts.
    """
    observed = set()
    for item in values:
        if item in observed:
            continue
        observed.add(item)
        yield item
19
+
20
+ #
21
+ #
22
+ #
23
class CitationParser:
    """Base class that turns file-citation annotations into ``Citation`` objects.

    The base class hands out bracketed reference markers and looks up the
    filename behind each annotation; subclasses decide how a citation is
    worded by implementing ``extract``.
    """

    def __init__(self, client, start=1):
        """Keep the API *client* and the number used for the first reference."""
        self.client = client
        self.start = start

    def __next__(self):
        """Return the next reference marker (``'[n]'``) and advance the counter."""
        value = f'[{self.start}]'
        self.start += 1
        return value

    def extract(self, annotation, document):
        """Return a ``(reference, citation)`` pair for *annotation*.

        *document* is the filename of the file the annotation cites.
        Subclasses must override this; the base class has no citation format.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement extract()'
        )

    def __call__(self, annotations):
        """Yield one ``Citation`` per item of *annotations*, in order."""
        # file_id -> filename for this call, so a file cited by several
        # annotations is retrieved from the client only once.
        filenames = {}
        for a in annotations:
            file_id = a.file_citation.file_id
            if file_id not in filenames:
                document = self.client.files.retrieve(file_id)
                filenames[file_id] = document.filename
            yield Citation(a.text, *self.extract(a, filenames[file_id]))
37
+
38
class StandardCitationParser(CitationParser):
    """Parser that issues a new reference for every annotation.

    Each citation carries the annotation's ``start_index`` and ``end_index``.
    """

    def extract(self, annotation, document):
        """Return ``(reference, citation)`` using a freshly issued reference.

        The citation is formatted as
        ``'<reference> <document>:<start_index>--<end_index>'``.
        """
        reference = next(self)
        citation = (
            f'{reference} {document}:'
            f'{annotation.start_index}--{annotation.end_index}'
        )

        return (reference, citation)
49
+
50
class SimpleCitationParser(CitationParser):
    """Parser that cites by document only, reusing one reference per document."""

    def __init__(self, client, start=1):
        """Set up the base parser and an empty document-to-reference table."""
        super().__init__(client, start)
        self.citations = {}  # document filename -> reference already issued

    def extract(self, annotation, document):
        """Return ``(reference, citation)`` for *document*.

        A document seen before gets the reference it was given the first
        time; a new document consumes the next reference number. The
        citation is ``'<reference> <document>'``; *annotation* is not read.
        """
        # References are always non-empty strings, so None safely means
        # "not issued yet".
        reference = self.citations.get(document)
        if reference is None:
            reference = next(self)
            self.citations[document] = reference

        return (reference, f'{reference} {document}')
64
+
65
+ #
66
+ #
67
+ #
68
class CitationManager:
    """Collect reference markers and citation lines for a set of annotations.

    Attributes:
        body: maps each annotation's text to its reference marker.
        citations: citation lines with duplicates removed, in first-seen
            order.
    """

    # Parser class used to build the citations. StandardCitationParser is
    # the alternative that keeps start and end indices in each citation.
    _c_parser = SimpleCitationParser

    def __init__(self, annotations, client, start=1):
        """Parse *annotations* with the configured parser.

        *client* is handed to the parser for file lookups; *start* is the
        number of the first reference and defaults to 1, matching the parser
        classes.
        """
        self.body = {}

        c_parser = self._c_parser(client, start)
        citations = []
        for c in c_parser(annotations):
            self.body[c.text] = c.refn
            citations.append(c.cite)

        # Several annotations can produce the same citation line (for
        # example when they cite the same document); list each line once.
        self.citations = list(unique(citations))

    def __len__(self):
        """Return the number of distinct citation lines."""
        return len(self.citations)