Update README.md
Browse files
README.md
CHANGED
@@ -21,8 +21,6 @@ This model was created with two key goals:
|
|
21 |
1. Improved NER results on social media
|
22 |
2. Target only place names
|
23 |
|
24 |
-
_**NOTE:** There is a small bug with sub-words having incorrect BILUO tags. The following processing accounts for this._
|
25 |
-
|
26 |
## Use in `transformers`
|
27 |
|
28 |
```python
|
@@ -32,93 +30,23 @@ generator = pipeline(
|
|
32 |
task="ner",
|
33 |
model="cjber/reddit-ner-place_names",
|
34 |
tokenizer="cjber/reddit-ner-place_names",
|
|
|
35 |
)
|
36 |
|
37 |
out = generator("I live north of liverpool in Waterloo")
|
38 |
-
|
39 |
-
entities = [item["word"] for item in out]
|
40 |
-
labels = [item["entity"] for item in out]
|
41 |
-
```
|
42 |
-
|
43 |
-
The integer index of each label is required for the following stages:
|
44 |
-
|
45 |
-
```python
|
46 |
-
class Label:
|
47 |
-
labels: dict[str, int] = {
|
48 |
-
"O": 0,
|
49 |
-
"B-location": 1,
|
50 |
-
"I-location": 2,
|
51 |
-
"L-location": 3,
|
52 |
-
"U-location": 4,
|
53 |
-
}
|
54 |
-
|
55 |
-
idx: dict[int, str] = {v: k for k, v in labels.items()}
|
56 |
-
count: int = len(labels)
|
57 |
```
|
58 |
|
59 |
-
|
60 |
|
61 |
```python
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
if not tokens[-idx + 1].startswith("Ġ"):
|
73 |
-
tokens[-idx] = tokens[-idx] + tokens[-idx + 1]
|
74 |
-
subwords = [i for i, _ in enumerate(tokens) if tokens[i].startswith("Ġ")]
|
75 |
-
|
76 |
-
tags = [tags[i] for i in subwords]
|
77 |
-
tokens = [tokens[i][1:] for i in subwords]
|
78 |
-
tags_str: list[str] = [Label.idx[i] for i in tags]
|
79 |
-
return tokens, tags_str
|
80 |
-
|
81 |
-
|
82 |
-
names, labels = combine_subwords(entities, [Label.labels[lb] for lb in labels])
|
83 |
-
```
|
84 |
-
|
85 |
-
Combine BILUO tags:
|
86 |
-
|
87 |
-
```python
|
88 |
-
def combine_biluo(tokens: list[str], tags: list[str]) -> tuple[list[str], list[str]]:
|
89 |
-
tokens_biluo = tokens.copy()
|
90 |
-
tags_biluo = tags.copy()
|
91 |
-
|
92 |
-
for idx, tag in enumerate(tags_biluo):
|
93 |
-
if idx + 1 < len(tags_biluo) and tag[0] == "B":
|
94 |
-
i = 1
|
95 |
-
while tags_biluo[idx + i][0] not in ["B", "O", "U"]:
|
96 |
-
tokens_biluo[idx] = f"{tokens_biluo[idx]} {tokens_biluo[idx + i]}"
|
97 |
-
i += 1
|
98 |
-
if idx + i == len(tokens_biluo):
|
99 |
-
break
|
100 |
-
|
101 |
-
zipped = [
|
102 |
-
(token, tag)
|
103 |
-
for (token, tag) in zip(tokens_biluo, tags_biluo)
|
104 |
-
if tag[0] not in ["I", "L"]
|
105 |
-
]
|
106 |
-
if list(zipped):
|
107 |
-
tokens_biluo, tags_biluo = zip(*zipped)
|
108 |
-
tags_biluo = [tag[2:] if tag != "O" else tag for tag in tags_biluo]
|
109 |
-
return list(tokens_biluo), tags_biluo
|
110 |
-
else:
|
111 |
-
return [], []
|
112 |
-
|
113 |
-
names, labels = combine_biluo(names, labels)
|
114 |
-
```
|
115 |
-
|
116 |
-
This gives:
|
117 |
-
|
118 |
-
```python
|
119 |
-
>>> names
|
120 |
-
['liverpool', 'Waterloo']
|
121 |
-
|
122 |
-
>>> labels
|
123 |
-
['location', 'location']
|
124 |
```
|
|
|
21 |
1. Improved NER results on social media
|
22 |
2. Target only place names
|
23 |
|
|
|
|
|
24 |
## Use in `transformers`
|
25 |
|
26 |
```python
|
|
|
30 |
task="ner",
|
31 |
model="cjber/reddit-ner-place_names",
|
32 |
tokenizer="cjber/reddit-ner-place_names",
|
33 |
+
aggregation_strategy="simple",
|
34 |
)
|
35 |
|
36 |
out = generator("I live north of liverpool in Waterloo")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
```
|
38 |
|
39 |
+
The `out` variable then contains:
|
40 |
|
41 |
```python
|
42 |
+
[{'entity_group': 'location',
|
43 |
+
'score': 0.94054973,
|
44 |
+
'word': ' liverpool',
|
45 |
+
'start': 16,
|
46 |
+
'end': 25},
|
47 |
+
{'entity_group': 'location',
|
48 |
+
'score': 0.99520856,
|
49 |
+
'word': ' Waterloo',
|
50 |
+
'start': 29,
|
51 |
+
'end': 37}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
```
|