wesslen committed on
Commit
505dd8b
·
verified ·
1 Parent(s): ac58281

Upload folder using huggingface_hub

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
Dockerfile ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ #COPY requirements.txt /app/
4
+ WORKDIR /app
5
+ # # Set up a new user named "user" with user ID 1000
6
+ # RUN useradd -m -u 1000 user
7
+ # # Switch to the "user" user
8
+ # USER user
9
+ # # Set home to the user's home directory
10
+ # ENV HOME=/home/user \
11
+ # PATH=/home/user/.local/bin:$PATH
12
+
13
+ # # Set the working directory to the user's home directory
14
+ # WORKDIR $HOME/app
15
+
16
+ # # Copy the current directory contents into the container at $HOME/app setting the owner to the user
17
+ # COPY --chown=user . $HOME/app
18
+
19
+ RUN --mount=type=secret,id=LICENSE_KEY,mode=0444,required=true \
20
+ pip install --upgrade pip \
21
+ && pip install typing_extensions==4.5.0 \
22
+ && pip install --quiet prodigy -f https://$(cat /run/secrets/LICENSE_KEY)@download.prodi.gy
23
+
24
+ RUN chmod 777 .
25
+
26
+ COPY prodigy.json .
27
+ COPY data ./data/
28
+ COPY recipe.py .
29
+ COPY prodigy.sh .
30
+
31
+ ENV PRODIGY_HOME /app
32
+ ENV PRODIGY_LOGGING "verbose"
33
+ ENV MAX_SESSIONS 50
34
+
35
+ EXPOSE 7860
36
+
37
+ CMD ["bash","prodigy.sh"]
__pycache__/recipe.cpython-39.pyc ADDED
Binary file (9.29 kB). View file
 
data/dataset.jsonl ADDED
File without changes
data/known_answers_cooking.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"text":"Cauliflower leaves. Good in a curry, stew, stir fry. Only found out about it recently..","meta":{"explanation_label":"Cauliflower leaves is an ingredient since in the context it's used in the dishes. Curry, stew, and stir fry are dishes as they are the final product of the ingredients like cauliflower leaves.","explanation_boundaries":null},"_input_hash":-1856281439,"_task_hash":-1652827864,"tokens":[{"text":"Cauliflower","start":0,"end":11,"id":0,"ws":true},{"text":"leaves","start":12,"end":18,"id":1,"ws":false},{"text":".","start":18,"end":19,"id":2,"ws":true},{"text":"Good","start":20,"end":24,"id":3,"ws":true},{"text":"in","start":25,"end":27,"id":4,"ws":true},{"text":"a","start":28,"end":29,"id":5,"ws":true},{"text":"curry","start":30,"end":35,"id":6,"ws":false},{"text":",","start":35,"end":36,"id":7,"ws":true},{"text":"stew","start":37,"end":41,"id":8,"ws":false},{"text":",","start":41,"end":42,"id":9,"ws":true},{"text":"stir","start":43,"end":47,"id":10,"ws":true},{"text":"fry","start":48,"end":51,"id":11,"ws":false},{"text":".","start":51,"end":52,"id":12,"ws":true},{"text":"Only","start":53,"end":57,"id":13,"ws":true},{"text":"found","start":58,"end":63,"id":14,"ws":true},{"text":"out","start":64,"end":67,"id":15,"ws":true},{"text":"about","start":68,"end":73,"id":16,"ws":true},{"text":"it","start":74,"end":76,"id":17,"ws":true},{"text":"recently","start":77,"end":85,"id":18,"ws":false},{"text":"..","start":85,"end":87,"id":19,"ws":false}],"spans":[{"token_start":0,"token_end":1,"start":0,"end":18,"text":"Cauliflower leaves","label":"INGREDIENT"},{"token_start":6,"token_end":6,"start":30,"end":35,"text":"curry","label":"DISH"},{"token_start":8,"token_end":8,"start":37,"end":41,"text":"stew","label":"DISH"},{"token_start":10,"token_end":11,"start":43,"end":51,"text":"stir fry","label":"DISH"}],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. 
Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\nCauliflower leaves. Good in a curry, stew, stir fry. Only found out about it recently..\n'''","response":"DISH: curry, stew, stir fry\nINGREDIENT: Cauliflower leaves\nEQUIPMENT:"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698173684,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
2
+ {"text":"My Pro 500 is going on 18 years old. Thing is a tank. Just don't drop it on your toe","meta":{"explanation_label":"Pro 500 is a popular blender by KitchenAid.","explanation_boundaries":"Incorrect boundaries as it included 'My'."},"_input_hash":664573206,"_task_hash":-2138876470,"tokens":[{"text":"My","start":0,"end":2,"id":0,"ws":true},{"text":"Pro","start":3,"end":6,"id":1,"ws":true},{"text":"500","start":7,"end":10,"id":2,"ws":true},{"text":"is","start":11,"end":13,"id":3,"ws":true},{"text":"going","start":14,"end":19,"id":4,"ws":true},{"text":"on","start":20,"end":22,"id":5,"ws":true},{"text":"18","start":23,"end":25,"id":6,"ws":true},{"text":"years","start":26,"end":31,"id":7,"ws":true},{"text":"old","start":32,"end":35,"id":8,"ws":false},{"text":".","start":35,"end":36,"id":9,"ws":true},{"text":"Thing","start":37,"end":42,"id":10,"ws":true},{"text":"is","start":43,"end":45,"id":11,"ws":true},{"text":"a","start":46,"end":47,"id":12,"ws":true},{"text":"tank","start":48,"end":52,"id":13,"ws":false},{"text":".","start":52,"end":53,"id":14,"ws":true},{"text":"Just","start":54,"end":58,"id":15,"ws":true},{"text":"do","start":59,"end":61,"id":16,"ws":false},{"text":"n't","start":61,"end":64,"id":17,"ws":true},{"text":"drop","start":65,"end":69,"id":18,"ws":true},{"text":"it","start":70,"end":72,"id":19,"ws":true},{"text":"on","start":73,"end":75,"id":20,"ws":true},{"text":"your","start":76,"end":80,"id":21,"ws":true},{"text":"toe","start":81,"end":84,"id":22,"ws":false}],"spans":[{"token_start":1,"token_end":2,"start":3,"end":10,"text":"Pro 500","label":"EQUIPMENT"}],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. 
Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\nMy Pro 500 is going on 18 years old. Thing is a tank. Just don't drop it on your toe\n'''","response":"EQUIPMENT: Pro 500"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698173153,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
3
+ {"text":"1/2 cup butter, 1 cup parm, and 1/2 cup heavy cream. Optionally put a half clove of minced garlic in the butter.","meta":{"explanation_label":"All of the entities are used and described as ingredients (inputs to dishes).","explanation_boundaries":"Measurements like cups or cloves are not to be included. Adjectives like 'minced' or 'heavy' should be included to better describe the ingredients."},"_input_hash":-2038609053,"_task_hash":481198127,"tokens":[{"text":"1/2","start":0,"end":3,"id":0,"ws":true},{"text":"cup","start":4,"end":7,"id":1,"ws":true},{"text":"butter","start":8,"end":14,"id":2,"ws":false},{"text":",","start":14,"end":15,"id":3,"ws":true},{"text":"1","start":16,"end":17,"id":4,"ws":true},{"text":"cup","start":18,"end":21,"id":5,"ws":true},{"text":"parm","start":22,"end":26,"id":6,"ws":false},{"text":",","start":26,"end":27,"id":7,"ws":true},{"text":"and","start":28,"end":31,"id":8,"ws":true},{"text":"1/2","start":32,"end":35,"id":9,"ws":true},{"text":"cup","start":36,"end":39,"id":10,"ws":true},{"text":"heavy","start":40,"end":45,"id":11,"ws":true},{"text":"cream","start":46,"end":51,"id":12,"ws":false},{"text":".","start":51,"end":52,"id":13,"ws":true},{"text":"Optionally","start":53,"end":63,"id":14,"ws":true},{"text":"put","start":64,"end":67,"id":15,"ws":true},{"text":"a","start":68,"end":69,"id":16,"ws":true},{"text":"half","start":70,"end":74,"id":17,"ws":true},{"text":"clove","start":75,"end":80,"id":18,"ws":true},{"text":"of","start":81,"end":83,"id":19,"ws":true},{"text":"minced","start":84,"end":90,"id":20,"ws":true},{"text":"garlic","start":91,"end":97,"id":21,"ws":true},{"text":"in","start":98,"end":100,"id":22,"ws":true},{"text":"the","start":101,"end":104,"id":23,"ws":true},{"text":"butter","start":105,"end":111,"id":24,"ws":false},{"text":".","start":111,"end":112,"id":25,"ws":false}],"spans":[{"token_start":2,"token_end":2,"start":8,"end":14,"text":"butter","label":"INGREDIENT"},{"token_start":6,"token_end":6,"start":22,"end
":26,"text":"parm","label":"INGREDIENT"},{"token_start":11,"token_end":12,"start":40,"end":51,"text":"heavy cream","label":"INGREDIENT"},{"token_start":20,"token_end":21,"start":84,"end":97,"text":"minced garlic","label":"INGREDIENT"},{"token_start":24,"token_end":24,"start":105,"end":111,"text":"butter","label":"INGREDIENT"}],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\n1/2 cup butter, 1 cup parm, and 1/2 cup heavy cream. Optionally put a half clove of minced garlic in the butter.\n'''","response":"EQUIPMENT: None\nINGREDIENT: butter, parm, heavy cream, minced garlic"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698172783,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
4
+ {"text":"The timing would be about right for the turkey, but what are you doing for sides? May just need to adjust your dinner time a bit.","meta":{"explanation_label":"Turkey could be a dish or ingredient; when in doubt, infer from the context. In this example, it'll be a dish since it's the final cooking output.","explanation_boundaries":null},"_input_hash":974804537,"_task_hash":-1041528534,"tokens":[{"text":"The","start":0,"end":3,"id":0,"ws":true},{"text":"timing","start":4,"end":10,"id":1,"ws":true},{"text":"would","start":11,"end":16,"id":2,"ws":true},{"text":"be","start":17,"end":19,"id":3,"ws":true},{"text":"about","start":20,"end":25,"id":4,"ws":true},{"text":"right","start":26,"end":31,"id":5,"ws":true},{"text":"for","start":32,"end":35,"id":6,"ws":true},{"text":"the","start":36,"end":39,"id":7,"ws":true},{"text":"turkey","start":40,"end":46,"id":8,"ws":false},{"text":",","start":46,"end":47,"id":9,"ws":true},{"text":"but","start":48,"end":51,"id":10,"ws":true},{"text":"what","start":52,"end":56,"id":11,"ws":true},{"text":"are","start":57,"end":60,"id":12,"ws":true},{"text":"you","start":61,"end":64,"id":13,"ws":true},{"text":"doing","start":65,"end":70,"id":14,"ws":true},{"text":"for","start":71,"end":74,"id":15,"ws":true},{"text":"sides","start":75,"end":80,"id":16,"ws":false},{"text":"?","start":80,"end":81,"id":17,"ws":true},{"text":"May","start":82,"end":85,"id":18,"ws":true},{"text":"just","start":86,"end":90,"id":19,"ws":true},{"text":"need","start":91,"end":95,"id":20,"ws":true},{"text":"to","start":96,"end":98,"id":21,"ws":true},{"text":"adjust","start":99,"end":105,"id":22,"ws":true},{"text":"your","start":106,"end":110,"id":23,"ws":true},{"text":"dinner","start":111,"end":117,"id":24,"ws":true},{"text":"time","start":118,"end":122,"id":25,"ws":true},{"text":"a","start":123,"end":124,"id":26,"ws":true},{"text":"bit","start":125,"end":128,"id":27,"ws":false},{"text":".","start":128,"end":129,"id":28,"ws":false}],"spans":[{"start":40,"end":46,"t
ext":"turkey","token_start":8,"token_end":8,"label":"DISH"}],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\nThe timing would be about right for the turkey, but what are you doing for sides? May just need to adjust your dinner time a bit.\n'''","response":"DISH: turkey\nEQUIPMENT: None\nINGREDIENT: sides"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698174298,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
5
+ {"text":"If you're into Italian/Sicilian try Lidia's Italy (PBS or YouTube).","meta":{"explanation_label":"Lidia's Italy is the title of a show, not a dish.","explanation_boundaries":null},"_input_hash":-929486735,"_task_hash":-815730362,"tokens":[{"text":"If","start":0,"end":2,"id":0,"ws":true},{"text":"you","start":3,"end":6,"id":1,"ws":false},{"text":"'re","start":6,"end":9,"id":2,"ws":true},{"text":"into","start":10,"end":14,"id":3,"ws":true},{"text":"Italian","start":15,"end":22,"id":4,"ws":false},{"text":"/","start":22,"end":23,"id":5,"ws":false},{"text":"Sicilian","start":23,"end":31,"id":6,"ws":true},{"text":"try","start":32,"end":35,"id":7,"ws":true},{"text":"Lidia","start":36,"end":41,"id":8,"ws":false},{"text":"'s","start":41,"end":43,"id":9,"ws":true},{"text":"Italy","start":44,"end":49,"id":10,"ws":true},{"text":"(","start":50,"end":51,"id":11,"ws":false},{"text":"PBS","start":51,"end":54,"id":12,"ws":true},{"text":"or","start":55,"end":57,"id":13,"ws":true},{"text":"YouTube","start":58,"end":65,"id":14,"ws":false},{"text":")","start":65,"end":66,"id":15,"ws":false},{"text":".","start":66,"end":67,"id":16,"ws":false}],"spans":[],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. 
oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\nIf you're into Italian/Sicilian try Lidia's Italy (PBS or YouTube).\n'''","response":"DISH: Lidia's Italy\nEQUIPMENT: None\nINGREDIENT: None"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698173755,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
data/reddit_r_cooking_sample.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
prodigy.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "port": 7860,
3
+ "host": "0.0.0.0",
4
+ "db": "sqlite",
5
+ "db_settings": {
6
+ "sqlite": {
7
+ "name": "prodigy.db",
8
+ "path": "/app"
9
+ }
10
+ },
11
+ "max_sessions": 50,
12
+ "show_stats": true,
13
+ "hide_meta": true
14
+ }
prodigy.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ python -m prodigy ner.qa.manual qa-ex blank:en data/known_answers_cooking.jsonl --label DISH,INGREDIENT,EQUIPMENT -F recipe.py
recipe.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from functools import partial
3
+ from typing import Callable, Iterable, List, Optional, Tuple, Union, Dict, Any
4
+
5
+ import murmurhash
6
+ from spacy.language import Language
7
+ from spacy.tokens.doc import SetEntsDefault # type: ignore
8
+ from spacy.training import Example
9
+ from spacy.util import filter_spans
10
+
11
+ from prodigy.components.db import connect
12
+ from prodigy.components.decorators import support_both_streams
13
+ from prodigy.components.filters import filter_seen_before
14
+ from prodigy.components.preprocess import (
15
+ add_annot_name,
16
+ add_tokens,
17
+ add_view_id,
18
+ make_ner_suggestions,
19
+ make_raw_doc,
20
+ resolve_labels,
21
+ split_sentences,
22
+ )
23
+ from prodigy.components.sorters import prefer_uncertain
24
+ from prodigy.components.source import GeneratorSource
25
+ from prodigy.components.stream import Stream, get_stream, load_noop
26
+ from prodigy.core import Arg, recipe
27
+ from prodigy.errors import RecipeError
28
+ from prodigy.models.matcher import PatternMatcher
29
+ from prodigy.models.ner import EntityRecognizerModel, ensure_sentencizer
30
+ from prodigy.protocols import ControllerComponentsDict
31
+ from prodigy.types import (
32
+ ExistingFilePath,
33
+ LabelsType,
34
+ SourceType,
35
+ StreamType,
36
+ TaskType,
37
+ )
38
+ from prodigy.util import (
39
+ ANNOTATOR_ID_ATTR,
40
+ BINARY_ATTR,
41
+ INPUT_HASH_ATTR,
42
+ TASK_HASH_ATTR,
43
+ combine_models,
44
+ copy_nlp,
45
+ get_pipe_labels,
46
+ log,
47
+ msg,
48
+ set_hashes,
49
+ )
50
+
51
+
52
+
53
+ def modify_spans(document):
54
+ # Modify the 'spans' key to be an empty list
55
+ document['spans'] = []
56
+ return document
57
+
58
+ def spans_equal(s1: Dict[str, Any], s2: Dict[str, Any]) -> bool:
59
+ """Checks if two spans are equal"""
60
+ return s1["start"] == s2["start"] and s1["end"] == s2["end"]
61
+
62
+
63
+ def labels_equal(s1: Dict[str, Any], s2: Dict[str, Any]) -> bool:
64
+ """Checks if two spans have the same label"""
65
+ return s1["label"] == s2["label"]
66
+
67
+
68
+ def ensure_span_text(eg: TaskType) -> TaskType:
69
+ """Ensure that all spans have a text attribute"""
70
+ for span in eg.get("spans", []):
71
+ if "text" not in span:
72
+ span["text"] = eg["text"][span["start"] : span["end"]]
73
+ return eg
74
+
75
+
76
+ def validate_answer(answer: TaskType, *, known_answers_map: Dict[int, TaskType]):
77
+ """Validate the answer against the known answers"""
78
+
79
+ known_answer = known_answers_map.get(answer[INPUT_HASH_ATTR])
80
+ if known_answer is None:
81
+ print(f"Skipping validation for answer {answer[INPUT_HASH_ATTR]}, no known answer found to validate against.")
82
+ return
83
+
84
+ known_answer = ensure_span_text(known_answer)
85
+ errors = []
86
+ known_spans = known_answer.get("spans", [])
87
+ answer_spans = answer.get("spans", [])
88
+ explanation_label = known_answer.get("meta", {}).get("explanation_label")
89
+ explanation_boundaries = known_answer.get("meta", {}).get(
90
+ "explanation_boundaries"
91
+ )
92
+ if not explanation_boundaries:
93
+ explanation_boundaries = (
94
+ "No explanation boundaries"
95
+ )
96
+
97
+ if len(known_spans) > len(answer_spans):
98
+ errors.append(
99
+ "You noted fewer entities than expected for this answer. All mentions must be annotated"
100
+ )
101
+ elif len(known_spans) < len(answer_spans):
102
+ errors.append(
103
+ "You noted more entities than expected for this answer."
104
+ )
105
+ if not known_spans:
106
+ # For cases where no annotations are expected
107
+ errors.append(explanation_label)
108
+ for known_span, span in zip(known_spans, answer_spans):
109
+ if not labels_equal(known_span, span):
110
+ # label error
111
+ errors.append(explanation_label)
112
+ continue
113
+ if not spans_equal(known_span, span):
114
+ # boundary error
115
+ errors.append(explanation_boundaries)
116
+ continue
117
+
118
+ if len(errors) > 0:
119
+ error_msg = "\n".join(errors)
120
+ error_msg += "\n\nExpected annotations:"
121
+ if known_spans:
122
+ expected_spans = [
123
+ f'[{s["text"]}]: {s["label"]}' for s in known_spans
124
+ ]
125
+ if expected_spans:
126
+ error_msg += "\n"
127
+ for span_msg in expected_spans:
128
+ error_msg += span_msg + "\n"
129
+ else:
130
+ error_msg += "\n\nNone."
131
+ raise ValueError(error_msg)
132
+
133
+ @recipe(
134
+ "ner.qa.manual",
135
+ # fmt: off
136
+ dataset=Arg(help="Dataset to save annotations to"),
137
+ nlp=Arg(help="Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)"),
138
+ source=Arg(help="Data to annotate (file path or '-' to read from standard input)"),
139
+ loader=Arg("--loader", "-lo", help="Loader (guessed from file extension if not set)"),
140
+ label=Arg("--label", "-l", help="Comma-separated label(s) to annotate or text file with one label per line"),
141
+ patterns=Arg("--patterns", "-pt", help="Path to match patterns file"),
142
+ exclude=Arg("--exclude", "-e", help="Comma-separated list of dataset IDs whose annotations to exclude"),
143
+ highlight_chars=Arg("--highlight-chars", "-C", help="Allow highlighting individual characters instead of tokens"),
144
+ # fmt: on
145
+ )
146
+ def manual(
147
+ dataset: str,
148
+ nlp: Language,
149
+ source: SourceType,
150
+ loader: Optional[str] = None,
151
+ label: Optional[LabelsType] = None,
152
+ patterns: Optional[ExistingFilePath] = None,
153
+ exclude: List[str] = [],
154
+ highlight_chars: bool = False,
155
+ ) -> ControllerComponentsDict:
156
+ """
157
+ Mark spans by token. Requires only a tokenizer and no entity recognizer,
158
+ and doesn't do any active learning. If patterns are provided, their matches
159
+ are highlighted in the example, if available. The recipe will present
160
+ all examples in order, so even examples without matches are shown. If
161
+ character highlighting is enabled, no "tokens" are saved to the database.
162
+ """
163
+ log("RECIPE: Starting recipe ner.manual", locals())
164
+ labels = get_pipe_labels(label, nlp.pipe_labels.get("ner", []))
165
+ stream = get_stream(
166
+ source,
167
+ loader=loader,
168
+ rehash=True,
169
+ dedup=True,
170
+ input_key="text",
171
+ is_binary=False,
172
+ )
173
+ if patterns is not None:
174
+ pattern_matcher = PatternMatcher(nlp, combine_matches=True, all_examples=True)
175
+ pattern_matcher = pattern_matcher.from_disk(patterns)
176
+ stream.apply(lambda examples: (eg for _, eg in pattern_matcher(examples)))
177
+ # Add "tokens" key to the tasks, either with words or characters
178
+ stream.apply(lambda examples: (modify_spans(eg) for eg in examples))
179
+ exclude_names = [ds.name for ds in exclude] if exclude is not None else None
180
+
181
+ known_answers = get_stream(
182
+ source,
183
+ loader=loader,
184
+ rehash=True,
185
+ dedup=True,
186
+ input_key="text",
187
+ is_binary=False,
188
+ )
189
+ known_answers_map = {eg[INPUT_HASH_ATTR]: eg for eg in known_answers}
190
+
191
+ return {
192
+ "view_id": "ner_manual",
193
+ "dataset": dataset,
194
+ "stream": [_ for _ in stream],
195
+ "exclude": exclude_names,
196
+ "validate_answer": partial(validate_answer, known_answers_map=known_answers_map),
197
+ "config": {
198
+ "lang": nlp.lang,
199
+ "labels": labels,
200
+ "exclude_by": "input",
201
+ "ner_manual_highlight_chars": highlight_chars,
202
+ },
203
+ }
204
+
205
+
206
+ @support_both_streams(stream_arg="stream")
207
+ def preprocess_stream(
208
+ stream: StreamType,
209
+ nlp: Language,
210
+ *,
211
+ labels: Optional[List[str]],
212
+ unsegmented: bool,
213
+ set_annotations: bool = True,
214
+ ) -> StreamType:
215
+ if not unsegmented:
216
+ stream = split_sentences(nlp, stream) # type: ignore
217
+ stream = add_tokens(nlp, stream) # type: ignore
218
+ if set_annotations:
219
+ spacy_model = f"{nlp.meta['lang']}_{nlp.meta['name']}"
220
+ # Add a 'spans' key to each example, with predicted entities
221
+ texts = ((eg["text"], eg) for eg in stream)
222
+ for doc, eg in nlp.pipe(texts, as_tuples=True, batch_size=10):
223
+ task = copy.deepcopy(eg)
224
+ spans = []
225
+ for ent in doc.ents:
226
+ if labels and ent.label_ not in labels:
227
+ continue
228
+ spans.append(ent)
229
+ for span in eg.get("spans", []):
230
+ spans.append(doc.char_span(span["start"], span["end"], span["label"]))
231
+ spans = filter_spans(spans)
232
+ span_dicts = []
233
+ for ent in spans:
234
+ span_dicts.append(
235
+ {
236
+ "token_start": ent.start,
237
+ "token_end": ent.end - 1,
238
+ "start": ent.start_char,
239
+ "end": ent.end_char,
240
+ "text": ent.text,
241
+ "label": ent.label_,
242
+ "source": spacy_model,
243
+ "input_hash": eg[INPUT_HASH_ATTR],
244
+ }
245
+ )
246
+ task["spans"] = span_dicts
247
+ task[BINARY_ATTR] = False
248
+ task = set_hashes(task)
249
+ yield task
250
+ else:
251
+ yield from stream
252
+
253
+
254
+ def get_ner_labels(
255
+ nlp: Language, *, label: Optional[List[str]], component: str = "ner"
256
+ ) -> Tuple[List[str], bool]:
257
+ model_labels = nlp.pipe_labels.get(component, [])
258
+ labels = get_pipe_labels(label, model_labels)
259
+ # Check if we're annotating all labels present in the model or a subset
260
+ no_missing = len(set(labels).intersection(set(model_labels))) == len(model_labels)
261
+ return labels, no_missing
262
+
263
+
264
+ def get_update(nlp: Language, *, no_missing: bool) -> Callable[[List[TaskType]], None]:
265
+ def update(answers: List[TaskType]) -> None:
266
+ log(f"RECIPE: Updating model with {len(answers)} answers")
267
+ examples = []
268
+ for eg in answers:
269
+ if eg["answer"] == "accept":
270
+ doc = make_raw_doc(nlp, eg)
271
+ ref = make_raw_doc(nlp, eg)
272
+ spans = [
273
+ doc.char_span(span["start"], span["end"], label=span["label"])
274
+ for span in eg.get("spans", [])
275
+ ]
276
+ value = SetEntsDefault.outside if no_missing else SetEntsDefault.missing
277
+ ref.set_ents(spans, default=value)
278
+ examples.append(Example(doc, ref))
279
+ nlp.update(examples)
280
+
281
+ return update