Spaces:
Runtime error
Runtime error
jerome-white
committed on
Commit
•
06d7446
1
Parent(s):
a9490de
Citations without start and end indices
Browse files
It is not clear what start and end indices are. This commit updates
the citation logic to support removing those indices, and reporting
citations in a more traditional way. The logic is handled in a
collection of parser classes, which are encapsulated in the citation
manager.
- mylib/_citations.py +78 -14
mylib/_citations.py
CHANGED
@@ -1,20 +1,84 @@
|
|
1 |
-
|
2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
self.start = start
|
4 |
-
self.body = {}
|
5 |
-
self.citations = []
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
for a in annotations:
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
def __len__(self):
|
20 |
return len(self.citations)
|
|
|
1 |
+
import collections as cl
|
2 |
+
from dataclasses import dataclass
|
3 |
+
|
4 |
+
#
|
5 |
+
#
|
6 |
+
#
|
7 |
+
@dataclass
class Citation:
    """One resolved citation, as produced by a citation parser."""

    # Annotation text exactly as it was reported on the annotation.
    text: str
    # Reference marker (such as "[1]") associated with that text.
    refn: str
    # Full citation line: the reference marker followed by source details.
    cite: str
|
12 |
+
|
13 |
+
def unique(values):
    """Lazily yield each distinct item of *values* once, in first-seen order.

    Items must be hashable because membership is tracked with a set.
    """
    seen = set()
    for v in values:
        if v not in seen:
            # Record before yielding so the bookkeeping is complete even if
            # the consumer stops iterating right after this item.
            seen.add(v)
            yield v
|
19 |
+
|
20 |
+
#
|
21 |
+
#
|
22 |
+
#
|
23 |
+
class CitationParser:
    """Base class that turns message annotations into ``Citation`` objects.

    The parser hands out sequential reference markers ("[1]", "[2]", ...)
    and looks up the filename behind each annotation through ``client``.
    Subclasses implement ``extract`` to decide how markers and citation
    lines are built.
    """

    def __init__(self, client, start=1):
        """Store the API client and the first reference number to issue."""
        self.client = client
        self.start = start

    def __next__(self):
        """Return the next reference marker and advance the counter."""
        value = f'[{self.start}]'
        self.start += 1
        return value

    def __call__(self, annotations):
        """Yield one ``Citation`` per item in *annotations*.

        Each annotation must expose ``text`` and ``file_citation.file_id``;
        the file id is resolved to a filename via
        ``self.client.files.retrieve``.
        """
        # Several annotations commonly point at the same file; remember the
        # filename per file id so each file is retrieved only once per call.
        filenames = {}
        for a in annotations:
            file_id = a.file_citation.file_id
            if file_id not in filenames:
                filenames[file_id] = self.client.files.retrieve(file_id).filename
            yield Citation(a.text, *self.extract(a, filenames[file_id]))

    def extract(self, annotation, document):
        """Return a ``(reference, citation)`` pair for one annotation.

        *document* is the filename the annotation refers to. The base class
        has no policy of its own, so subclasses must override this method.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement extract()'
        )
|
37 |
+
|
38 |
+
class StandardCitationParser(CitationParser):
    """Parser that cites every annotation separately, with its index range."""

    def extract(self, annotation, document):
        """Return ``(reference, citation)`` for *annotation*.

        A fresh reference marker is issued on every call. The citation line
        has the form ``"<marker> <document>:<start_index>--<end_index>"``,
        using the annotation's ``start_index`` and ``end_index`` attributes.
        """
        reference = next(self)
        citation = (
            f'{reference} {document}:'
            f'{annotation.start_index}--{annotation.end_index}'
        )

        return (reference, citation)
|
49 |
+
|
50 |
+
class SimpleCitationParser(CitationParser):
    """Parser that cites each document once, without start/end indices."""

    def __init__(self, client, start=1):
        """Initialise the base parser and the document-to-marker table."""
        super().__init__(client, start)
        # Maps a document name to the reference marker already issued for it.
        self.citations = {}

    def extract(self, annotation, document):
        """Return ``(reference, citation)`` for *annotation*.

        All annotations that refer to the same *document* share a single
        reference marker; a new marker is issued only the first time a
        document is seen. The citation line is ``"<marker> <document>"``.
        *annotation* itself is not inspected.
        """
        try:
            reference = self.citations[document]
        except KeyError:
            # First sighting of this document: allocate the next marker.
            reference = next(self)
            self.citations[document] = reference
        citation = f'{reference} {document}'

        return (reference, citation)
|
64 |
+
|
65 |
+
#
|
66 |
+
#
|
67 |
+
#
|
68 |
+
class CitationManager:
    """Collect the citations for a set of annotations.

    After construction, ``body`` maps each annotation's text to its
    reference marker and ``citations`` holds the distinct citation lines in
    the order they were first produced.
    """

    # Parser class used to build citations. SimpleCitationParser reports one
    # entry per document; StandardCitationParser can be substituted here to
    # include each annotation's start and end indices instead.
    _c_parser = SimpleCitationParser

    def __init__(self, annotations, client, start):
        """Parse *annotations* and populate ``body`` and ``citations``.

        *client* is handed to the parser for file lookups and *start* is
        the first reference number the parser will issue.
        """
        self.body = {}

        c_parser = self._c_parser(client, start)
        citations = []
        for c in c_parser(annotations):
            self.body[c.text] = c.refn
            citations.append(c.cite)

        # The same citation line can be produced by several annotations;
        # keep only the first occurrence of each.
        self.citations = list(unique(citations))

    def __len__(self):
        """Return the number of distinct citation lines."""
        return len(self.citations)
|