Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .DS_Store +0 -0
- Dockerfile +37 -0
- __pycache__/recipe.cpython-39.pyc +0 -0
- data/dataset.jsonl +0 -0
- data/known_answers_cooking.jsonl +5 -0
- data/reddit_r_cooking_sample.jsonl +0 -0
- prodigy.json +14 -0
- prodigy.sh +1 -0
- recipe.py +281 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
Dockerfile
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Dockerfile for a Hugging Face Space running a Prodigy annotation server.
FROM python:3.9

#COPY requirements.txt /app/
WORKDIR /app
# # Set up a new user named "user" with user ID 1000
# RUN useradd -m -u 1000 user
# # Switch to the "user" user
# USER user
# # Set home to the user's home directory
# ENV HOME=/home/user \
# PATH=/home/user/.local/bin:$PATH

# # Set the working directory to the user's home directory
# WORKDIR $HOME/app

# # Copy the current directory contents into the container at $HOME/app setting the owner to the user
# COPY --chown=user . $HOME/app

# Install Prodigy from its private wheel index. The license key is passed as a
# BuildKit secret so it never ends up in an image layer or the build history.
RUN --mount=type=secret,id=LICENSE_KEY,mode=0444,required=true \
    pip install --upgrade pip \
    && pip install typing_extensions==4.5.0 \
    && pip install --quiet prodigy -f https://$(cat /run/secrets/LICENSE_KEY)@download.prodi.gy

# NOTE(review): world-writable workdir, presumably so the Space's non-root
# runtime user can create the SQLite DB here — confirm this is intended.
RUN chmod 777 .

# Copy only what the server needs: config, data, the custom recipe and launcher.
COPY prodigy.json .
COPY data ./data/
COPY recipe.py .
COPY prodigy.sh .

# Make Prodigy read prodigy.json from /app and log verbosely.
ENV PRODIGY_HOME /app
ENV PRODIGY_LOGGING "verbose"
ENV MAX_SESSIONS 50

# Hugging Face Spaces route traffic to port 7860.
EXPOSE 7860

CMD ["bash","prodigy.sh"]
|
__pycache__/recipe.cpython-39.pyc
ADDED
Binary file (9.29 kB). View file
|
|
data/dataset.jsonl
ADDED
File without changes
|
data/known_answers_cooking.jsonl
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"text":"Cauliflower leaves. Good in a curry, stew, stir fry. Only found out about it recently..","meta":{"explanation_label":"Cauliflower leaves is an ingredient since in the context it's used in the dishes. Curry, stew, and stir fry are dishes as they are the final product of the ingredients like cauliflower leaves.","explanation_boundaries":null},"_input_hash":-1856281439,"_task_hash":-1652827864,"tokens":[{"text":"Cauliflower","start":0,"end":11,"id":0,"ws":true},{"text":"leaves","start":12,"end":18,"id":1,"ws":false},{"text":".","start":18,"end":19,"id":2,"ws":true},{"text":"Good","start":20,"end":24,"id":3,"ws":true},{"text":"in","start":25,"end":27,"id":4,"ws":true},{"text":"a","start":28,"end":29,"id":5,"ws":true},{"text":"curry","start":30,"end":35,"id":6,"ws":false},{"text":",","start":35,"end":36,"id":7,"ws":true},{"text":"stew","start":37,"end":41,"id":8,"ws":false},{"text":",","start":41,"end":42,"id":9,"ws":true},{"text":"stir","start":43,"end":47,"id":10,"ws":true},{"text":"fry","start":48,"end":51,"id":11,"ws":false},{"text":".","start":51,"end":52,"id":12,"ws":true},{"text":"Only","start":53,"end":57,"id":13,"ws":true},{"text":"found","start":58,"end":63,"id":14,"ws":true},{"text":"out","start":64,"end":67,"id":15,"ws":true},{"text":"about","start":68,"end":73,"id":16,"ws":true},{"text":"it","start":74,"end":76,"id":17,"ws":true},{"text":"recently","start":77,"end":85,"id":18,"ws":false},{"text":"..","start":85,"end":87,"id":19,"ws":false}],"spans":[{"token_start":0,"token_end":1,"start":0,"end":18,"text":"Cauliflower leaves","label":"INGREDIENT"},{"token_start":6,"token_end":6,"start":30,"end":35,"text":"curry","label":"DISH"},{"token_start":8,"token_end":8,"start":37,"end":41,"text":"stew","label":"DISH"},{"token_start":10,"token_end":11,"start":43,"end":51,"text":"stir fry","label":"DISH"}],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. 
Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\nCauliflower leaves. Good in a curry, stew, stir fry. Only found out about it recently..\n'''","response":"DISH: curry, stew, stir fry\nINGREDIENT: Cauliflower leaves\nEQUIPMENT:"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698173684,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
|
2 |
+
{"text":"My Pro 500 is going on 18 years old. Thing is a tank. Just don't drop it on your toe","meta":{"explanation_label":"Pro 500 is a popular blender by KitchenAid.","explanation_boundaries":"Incorrect boundaries as it included 'My'."},"_input_hash":664573206,"_task_hash":-2138876470,"tokens":[{"text":"My","start":0,"end":2,"id":0,"ws":true},{"text":"Pro","start":3,"end":6,"id":1,"ws":true},{"text":"500","start":7,"end":10,"id":2,"ws":true},{"text":"is","start":11,"end":13,"id":3,"ws":true},{"text":"going","start":14,"end":19,"id":4,"ws":true},{"text":"on","start":20,"end":22,"id":5,"ws":true},{"text":"18","start":23,"end":25,"id":6,"ws":true},{"text":"years","start":26,"end":31,"id":7,"ws":true},{"text":"old","start":32,"end":35,"id":8,"ws":false},{"text":".","start":35,"end":36,"id":9,"ws":true},{"text":"Thing","start":37,"end":42,"id":10,"ws":true},{"text":"is","start":43,"end":45,"id":11,"ws":true},{"text":"a","start":46,"end":47,"id":12,"ws":true},{"text":"tank","start":48,"end":52,"id":13,"ws":false},{"text":".","start":52,"end":53,"id":14,"ws":true},{"text":"Just","start":54,"end":58,"id":15,"ws":true},{"text":"do","start":59,"end":61,"id":16,"ws":false},{"text":"n't","start":61,"end":64,"id":17,"ws":true},{"text":"drop","start":65,"end":69,"id":18,"ws":true},{"text":"it","start":70,"end":72,"id":19,"ws":true},{"text":"on","start":73,"end":75,"id":20,"ws":true},{"text":"your","start":76,"end":80,"id":21,"ws":true},{"text":"toe","start":81,"end":84,"id":22,"ws":false}],"spans":[{"token_start":1,"token_end":2,"start":3,"end":10,"text":"Pro 500","label":"EQUIPMENT"}],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. 
Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\nMy Pro 500 is going on 18 years old. Thing is a tank. Just don't drop it on your toe\n'''","response":"EQUIPMENT: Pro 500"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698173153,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
|
3 |
+
{"text":"1/2 cup butter, 1 cup parm, and 1/2 cup heavy cream. Optionally put a half clove of minced garlic in the butter.","meta":{"explanation_label":"All of the entities are used and described as ingredients (inputs to dishes).","explanation_boundaries":"Measurements like cups or cloves are not to be included. Adjectives like 'minced' or 'heavy' should be included to better describe the ingredients."},"_input_hash":-2038609053,"_task_hash":481198127,"tokens":[{"text":"1/2","start":0,"end":3,"id":0,"ws":true},{"text":"cup","start":4,"end":7,"id":1,"ws":true},{"text":"butter","start":8,"end":14,"id":2,"ws":false},{"text":",","start":14,"end":15,"id":3,"ws":true},{"text":"1","start":16,"end":17,"id":4,"ws":true},{"text":"cup","start":18,"end":21,"id":5,"ws":true},{"text":"parm","start":22,"end":26,"id":6,"ws":false},{"text":",","start":26,"end":27,"id":7,"ws":true},{"text":"and","start":28,"end":31,"id":8,"ws":true},{"text":"1/2","start":32,"end":35,"id":9,"ws":true},{"text":"cup","start":36,"end":39,"id":10,"ws":true},{"text":"heavy","start":40,"end":45,"id":11,"ws":true},{"text":"cream","start":46,"end":51,"id":12,"ws":false},{"text":".","start":51,"end":52,"id":13,"ws":true},{"text":"Optionally","start":53,"end":63,"id":14,"ws":true},{"text":"put","start":64,"end":67,"id":15,"ws":true},{"text":"a","start":68,"end":69,"id":16,"ws":true},{"text":"half","start":70,"end":74,"id":17,"ws":true},{"text":"clove","start":75,"end":80,"id":18,"ws":true},{"text":"of","start":81,"end":83,"id":19,"ws":true},{"text":"minced","start":84,"end":90,"id":20,"ws":true},{"text":"garlic","start":91,"end":97,"id":21,"ws":true},{"text":"in","start":98,"end":100,"id":22,"ws":true},{"text":"the","start":101,"end":104,"id":23,"ws":true},{"text":"butter","start":105,"end":111,"id":24,"ws":false},{"text":".","start":111,"end":112,"id":25,"ws":false}],"spans":[{"token_start":2,"token_end":2,"start":8,"end":14,"text":"butter","label":"INGREDIENT"},{"token_start":6,"token_end":6,"start":22,"end":
26,"text":"parm","label":"INGREDIENT"},{"token_start":11,"token_end":12,"start":40,"end":51,"text":"heavy cream","label":"INGREDIENT"},{"token_start":20,"token_end":21,"start":84,"end":97,"text":"minced garlic","label":"INGREDIENT"},{"token_start":24,"token_end":24,"start":105,"end":111,"text":"butter","label":"INGREDIENT"}],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\n1/2 cup butter, 1 cup parm, and 1/2 cup heavy cream. Optionally put a half clove of minced garlic in the butter.\n'''","response":"EQUIPMENT: None\nINGREDIENT: butter, parm, heavy cream, minced garlic"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698172783,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
|
4 |
+
{"text":"The timing would be about right for the turkey, but what are you doing for sides? May just need to adjust your dinner time a bit.","meta":{"explanation_label":"Turkey could be a dish or ingredient; when in doubt, infer from the context. In this example, it'll be a dish since it's the final cooking output.","explanation_boundaries":null},"_input_hash":974804537,"_task_hash":-1041528534,"tokens":[{"text":"The","start":0,"end":3,"id":0,"ws":true},{"text":"timing","start":4,"end":10,"id":1,"ws":true},{"text":"would","start":11,"end":16,"id":2,"ws":true},{"text":"be","start":17,"end":19,"id":3,"ws":true},{"text":"about","start":20,"end":25,"id":4,"ws":true},{"text":"right","start":26,"end":31,"id":5,"ws":true},{"text":"for","start":32,"end":35,"id":6,"ws":true},{"text":"the","start":36,"end":39,"id":7,"ws":true},{"text":"turkey","start":40,"end":46,"id":8,"ws":false},{"text":",","start":46,"end":47,"id":9,"ws":true},{"text":"but","start":48,"end":51,"id":10,"ws":true},{"text":"what","start":52,"end":56,"id":11,"ws":true},{"text":"are","start":57,"end":60,"id":12,"ws":true},{"text":"you","start":61,"end":64,"id":13,"ws":true},{"text":"doing","start":65,"end":70,"id":14,"ws":true},{"text":"for","start":71,"end":74,"id":15,"ws":true},{"text":"sides","start":75,"end":80,"id":16,"ws":false},{"text":"?","start":80,"end":81,"id":17,"ws":true},{"text":"May","start":82,"end":85,"id":18,"ws":true},{"text":"just","start":86,"end":90,"id":19,"ws":true},{"text":"need","start":91,"end":95,"id":20,"ws":true},{"text":"to","start":96,"end":98,"id":21,"ws":true},{"text":"adjust","start":99,"end":105,"id":22,"ws":true},{"text":"your","start":106,"end":110,"id":23,"ws":true},{"text":"dinner","start":111,"end":117,"id":24,"ws":true},{"text":"time","start":118,"end":122,"id":25,"ws":true},{"text":"a","start":123,"end":124,"id":26,"ws":true},{"text":"bit","start":125,"end":128,"id":27,"ws":false},{"text":".","start":128,"end":129,"id":28,"ws":false}],"spans":[{"start":40,"end":46,"tex
t":"turkey","token_start":8,"token_end":8,"label":"DISH"}],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\nThe timing would be about right for the turkey, but what are you doing for sides? May just need to adjust your dinner time a bit.\n'''","response":"DISH: turkey\nEQUIPMENT: None\nINGREDIENT: sides"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698174298,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
|
5 |
+
{"text":"If you're into Italian/Sicilian try Lidia's Italy (PBS or YouTube).","meta":{"explanation_label":"Lidia's Italy is the title of a show, not a dish.","explanation_boundaries":null},"_input_hash":-929486735,"_task_hash":-815730362,"tokens":[{"text":"If","start":0,"end":2,"id":0,"ws":true},{"text":"you","start":3,"end":6,"id":1,"ws":false},{"text":"'re","start":6,"end":9,"id":2,"ws":true},{"text":"into","start":10,"end":14,"id":3,"ws":true},{"text":"Italian","start":15,"end":22,"id":4,"ws":false},{"text":"/","start":22,"end":23,"id":5,"ws":false},{"text":"Sicilian","start":23,"end":31,"id":6,"ws":true},{"text":"try","start":32,"end":35,"id":7,"ws":true},{"text":"Lidia","start":36,"end":41,"id":8,"ws":false},{"text":"'s","start":41,"end":43,"id":9,"ws":true},{"text":"Italy","start":44,"end":49,"id":10,"ws":true},{"text":"(","start":50,"end":51,"id":11,"ws":false},{"text":"PBS","start":51,"end":54,"id":12,"ws":true},{"text":"or","start":55,"end":57,"id":13,"ws":true},{"text":"YouTube","start":58,"end":65,"id":14,"ws":false},{"text":")","start":65,"end":66,"id":15,"ws":false},{"text":".","start":66,"end":67,"id":16,"ws":false}],"spans":[],"llm":{"prompt":"You are an expert Named Entity Recognition (NER) system. Your task is to accept Text as input and extract named entities for the set of predefined entity labels.\nFrom the Text input provided, extract named entities for each label in the following format:\n\nDISH: <comma delimited list of strings>\nEQUIPMENT: <comma delimited list of strings>\nINGREDIENT: <comma delimited list of strings>\n\nBelow are definitions of each label to help aid you in what kinds of named entities to extract for each label.\nAssume these definitions are written by an expert and follow them closely.\n\nDISH: Extract the name of a known dish.\nINGREDIENT: Extract the name of a cooking ingredient, including herbs and spices.\nEQUIPMENT: Extract any mention of cooking equipment. e.g. 
oven, cooking pot, grill\n\n\nBelow are some examples (only use these as a guide):\n\nText:\n'''\nYou can probably sand-blast it if it's an anodized aluminum pan.\n'''\n\nEQUIPMENT: anodized aluminum pan\n\n\nHere is the text that needs labeling:\n\nText:\n'''\nIf you're into Italian/Sicilian try Lidia's Italy (PBS or YouTube).\n'''","response":"DISH: Lidia's Italy\nEQUIPMENT: None\nINGREDIENT: None"},"_view_id":"ner_manual","_is_binary":false,"answer":"accept","_timestamp":1698173755,"_annotator_id":"2023-10-24_14-36-43","_session_id":"2023-10-24_14-36-43"}
|
data/reddit_r_cooking_sample.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
prodigy.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"port": 7860,
|
3 |
+
"host": "0.0.0.0",
|
4 |
+
"db": "sqlite",
|
5 |
+
"db_settings": {
|
6 |
+
"sqlite": {
|
7 |
+
"name": "prodigy.db",
|
8 |
+
"path": "/app"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"max_sessions": 50,
|
12 |
+
"show_stats": true,
|
13 |
+
"hide_meta": true
|
14 |
+
}
|
prodigy.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Launch the custom QA-enabled manual NER recipe (defined in recipe.py via -F)
# on the known-answers dataset, saving annotations to the "qa-ex" dataset.
python -m prodigy ner.qa.manual qa-ex blank:en data/known_answers_cooking.jsonl --label DISH,INGREDIENT,EQUIPMENT -F recipe.py
|
recipe.py
ADDED
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
from functools import partial
|
3 |
+
from typing import Callable, Iterable, List, Optional, Tuple, Union, Dict, Any
|
4 |
+
|
5 |
+
import murmurhash
|
6 |
+
from spacy.language import Language
|
7 |
+
from spacy.tokens.doc import SetEntsDefault # type: ignore
|
8 |
+
from spacy.training import Example
|
9 |
+
from spacy.util import filter_spans
|
10 |
+
|
11 |
+
from prodigy.components.db import connect
|
12 |
+
from prodigy.components.decorators import support_both_streams
|
13 |
+
from prodigy.components.filters import filter_seen_before
|
14 |
+
from prodigy.components.preprocess import (
|
15 |
+
add_annot_name,
|
16 |
+
add_tokens,
|
17 |
+
add_view_id,
|
18 |
+
make_ner_suggestions,
|
19 |
+
make_raw_doc,
|
20 |
+
resolve_labels,
|
21 |
+
split_sentences,
|
22 |
+
)
|
23 |
+
from prodigy.components.sorters import prefer_uncertain
|
24 |
+
from prodigy.components.source import GeneratorSource
|
25 |
+
from prodigy.components.stream import Stream, get_stream, load_noop
|
26 |
+
from prodigy.core import Arg, recipe
|
27 |
+
from prodigy.errors import RecipeError
|
28 |
+
from prodigy.models.matcher import PatternMatcher
|
29 |
+
from prodigy.models.ner import EntityRecognizerModel, ensure_sentencizer
|
30 |
+
from prodigy.protocols import ControllerComponentsDict
|
31 |
+
from prodigy.types import (
|
32 |
+
ExistingFilePath,
|
33 |
+
LabelsType,
|
34 |
+
SourceType,
|
35 |
+
StreamType,
|
36 |
+
TaskType,
|
37 |
+
)
|
38 |
+
from prodigy.util import (
|
39 |
+
ANNOTATOR_ID_ATTR,
|
40 |
+
BINARY_ATTR,
|
41 |
+
INPUT_HASH_ATTR,
|
42 |
+
TASK_HASH_ATTR,
|
43 |
+
combine_models,
|
44 |
+
copy_nlp,
|
45 |
+
get_pipe_labels,
|
46 |
+
log,
|
47 |
+
msg,
|
48 |
+
set_hashes,
|
49 |
+
)
|
50 |
+
|
51 |
+
|
52 |
+
|
53 |
+
def modify_spans(document):
    """Clear any pre-existing "spans" on *document* (mutates in place).

    Returns the same dict so the call composes inside a generator
    expression over a stream of annotation tasks.
    """
    document["spans"] = []
    return document
57 |
+
|
58 |
+
def spans_equal(s1: Dict[str, Any], s2: Dict[str, Any]) -> bool:
    """Return True when both spans cover exactly the same character offsets."""
    return (s1["start"], s1["end"]) == (s2["start"], s2["end"])
|
61 |
+
|
62 |
+
|
63 |
+
def labels_equal(s1: Dict[str, Any], s2: Dict[str, Any]) -> bool:
    """Return True when both spans carry the same entity label."""
    return s1["label"] == s2["label"]
|
66 |
+
|
67 |
+
|
68 |
+
def ensure_span_text(eg: TaskType) -> TaskType:
    """Backfill a "text" attribute on every span of *eg* (mutates in place).

    Spans that already carry "text" are left untouched; missing ones get the
    substring of eg["text"] between their start/end character offsets.
    """
    full_text = eg["text"] if eg.get("spans") else None
    for span in eg.get("spans", []):
        if "text" in span:
            continue
        span["text"] = full_text[span["start"] : span["end"]]
    return eg
|
74 |
+
|
75 |
+
|
76 |
+
def validate_answer(answer: TaskType, *, known_answers_map: Dict[int, TaskType]):
    """Validate a submitted annotation against the known "gold" answers.

    Intended as a Prodigy ``validate_answer`` callback: raising ``ValueError``
    makes the UI show the message and block the answer, so the message is
    built as instructive feedback including the expected annotations.

    Args:
        answer: Task dict submitted by the annotator.
        known_answers_map: Gold tasks keyed by their input hash.

    Raises:
        ValueError: If the submitted spans differ from the known answer.
    """
    known_answer = known_answers_map.get(answer[INPUT_HASH_ATTR])
    if known_answer is None:
        print(f"Skipping validation for answer {answer[INPUT_HASH_ATTR]}, no known answer found to validate against.")
        return

    known_answer = ensure_span_text(known_answer)
    errors = []
    known_spans = known_answer.get("spans", [])
    answer_spans = answer.get("spans", [])
    meta = known_answer.get("meta", {})
    # Fall back to placeholder strings: the meta keys may be absent or null
    # (see data/known_answers_cooking.jsonl), and "\n".join(errors) below
    # would raise TypeError if None ever reached the error list.
    explanation_label = meta.get("explanation_label") or "No explanation available"
    explanation_boundaries = meta.get("explanation_boundaries") or "No explanation boundaries"

    if len(known_spans) > len(answer_spans):
        errors.append(
            "You noted fewer entities than expected for this answer. All mentions must be annotated"
        )
    elif len(known_spans) < len(answer_spans):
        errors.append(
            "You noted more entities than expected for this answer."
        )
    # BUG FIX: this previously fired whenever known_spans was empty, so an
    # annotator who *correctly* submitted no spans was still rejected. Only
    # flag the case where spans were added to an example that expects none.
    if not known_spans and answer_spans:
        errors.append(explanation_label)
    # NOTE(review): pairwise zip assumes both span lists are in the same
    # document order — confirm the UI always emits spans left-to-right.
    for known_span, span in zip(known_spans, answer_spans):
        if not labels_equal(known_span, span):
            # Wrong entity label
            errors.append(explanation_label)
            continue
        if not spans_equal(known_span, span):
            # Right label, wrong character boundaries
            errors.append(explanation_boundaries)
            continue

    if errors:
        error_msg = "\n".join(errors)
        error_msg += "\n\nExpected annotations:"
        if known_spans:
            error_msg += "\n"
            for s in known_spans:
                error_msg += f'[{s["text"]}]: {s["label"]}' + "\n"
        else:
            error_msg += "\n\nNone."
        raise ValueError(error_msg)
|
132 |
+
|
133 |
+
@recipe(
    "ner.qa.manual",
    # fmt: off
    dataset=Arg(help="Dataset to save annotations to"),
    nlp=Arg(help="Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)"),
    source=Arg(help="Data to annotate (file path or '-' to read from standard input)"),
    loader=Arg("--loader", "-lo", help="Loader (guessed from file extension if not set)"),
    label=Arg("--label", "-l", help="Comma-separated label(s) to annotate or text file with one label per line"),
    patterns=Arg("--patterns", "-pt", help="Path to match patterns file"),
    exclude=Arg("--exclude", "-e", help="Comma-separated list of dataset IDs whose annotations to exclude"),
    highlight_chars=Arg("--highlight-chars", "-C", help="Allow highlighting individual characters instead of tokens"),
    # fmt: on
)
def manual(
    dataset: str,
    nlp: Language,
    source: SourceType,
    loader: Optional[str] = None,
    label: Optional[LabelsType] = None,
    patterns: Optional[ExistingFilePath] = None,
    exclude: List[str] = [],
    highlight_chars: bool = False,
) -> ControllerComponentsDict:
    """
    Mark spans by token. Requires only a tokenizer and no entity recognizer,
    and doesn't do any active learning. If patterns are provided, their matches
    are highlighted in the example, if available. The recipe will present
    all examples in order, so even examples without matches are shown. If
    character highlighting is enabled, no "tokens" are saved to the database.

    This QA variant additionally re-reads ``source`` as the set of known
    (gold) answers and registers a ``validate_answer`` callback that compares
    every submitted annotation against them.
    """
    # NOTE(review): log message says "ner.manual" but this recipe is
    # registered as "ner.qa.manual" — confirm whether that's intentional.
    log("RECIPE: Starting recipe ner.manual", locals())
    labels = get_pipe_labels(label, nlp.pipe_labels.get("ner", []))
    stream = get_stream(
        source,
        loader=loader,
        rehash=True,
        dedup=True,
        input_key="text",
        is_binary=False,
    )
    if patterns is not None:
        pattern_matcher = PatternMatcher(nlp, combine_matches=True, all_examples=True)
        pattern_matcher = pattern_matcher.from_disk(patterns)
        stream.apply(lambda examples: (eg for _, eg in pattern_matcher(examples)))
    # Add "tokens" key to the tasks, either with words or characters
    # NOTE(review): modify_spans wipes every "spans" entry, including the
    # pattern matches added just above AND the gold spans present in the
    # source JSONL — presumably so annotators start from a blank slate for
    # QA. Confirm this is intended when --patterns is used.
    stream.apply(lambda examples: (modify_spans(eg) for eg in examples))
    # NOTE(review): `exclude` is annotated List[str], yet `.name` is accessed
    # on each element — this would AttributeError on plain strings. Verify
    # what type Prodigy actually passes here.
    exclude_names = [ds.name for ds in exclude] if exclude is not None else None

    # Read the same source a second time, untouched by modify_spans, to keep
    # the gold spans; keyed by input hash for lookup during validation.
    known_answers = get_stream(
        source,
        loader=loader,
        rehash=True,
        dedup=True,
        input_key="text",
        is_binary=False,
    )
    known_answers_map = {eg[INPUT_HASH_ATTR]: eg for eg in known_answers}

    return {
        "view_id": "ner_manual",
        "dataset": dataset,
        # NOTE(review): materializing the stream into a list defeats lazy
        # streaming; presumably acceptable for this small QA dataset.
        "stream": [_ for _ in stream],
        "exclude": exclude_names,
        "validate_answer": partial(validate_answer, known_answers_map=known_answers_map),
        "config": {
            "lang": nlp.lang,
            "labels": labels,
            "exclude_by": "input",
            "ner_manual_highlight_chars": highlight_chars,
        },
    }
|
204 |
+
|
205 |
+
|
206 |
+
@support_both_streams(stream_arg="stream")
def preprocess_stream(
    stream: StreamType,
    nlp: Language,
    *,
    labels: Optional[List[str]],
    unsegmented: bool,
    set_annotations: bool = True,
) -> StreamType:
    """Segment, tokenize and optionally pre-annotate a stream of tasks.

    Args:
        stream: Incoming annotation tasks with a "text" key.
        nlp: spaCy pipeline used for segmentation, tokenization and (when
            ``set_annotations`` is True) entity prediction.
        labels: If given, only model entities with these labels are kept.
        unsegmented: When False, tasks are split into sentences first.
        set_annotations: When True, each task is yielded with a "spans" key
            merging model predictions and any pre-existing spans; otherwise
            tasks pass through with tokens only.
    """
    if not unsegmented:
        stream = split_sentences(nlp, stream)  # type: ignore
    stream = add_tokens(nlp, stream)  # type: ignore
    if set_annotations:
        # Recorded on each span so annotations are traceable to the model.
        spacy_model = f"{nlp.meta['lang']}_{nlp.meta['name']}"
        # Add a 'spans' key to each example, with predicted entities
        texts = ((eg["text"], eg) for eg in stream)
        for doc, eg in nlp.pipe(texts, as_tuples=True, batch_size=10):
            task = copy.deepcopy(eg)
            spans = []
            for ent in doc.ents:
                if labels and ent.label_ not in labels:
                    continue
                spans.append(ent)
            for span in eg.get("spans", []):
                # BUG FIX: Doc.char_span returns None when the character
                # offsets don't align with token boundaries; previously the
                # None was appended and later crashed filter_spans. Skip
                # spans that can't be aligned instead.
                char_span = doc.char_span(span["start"], span["end"], span["label"])
                if char_span is not None:
                    spans.append(char_span)
            # Resolve overlaps, preferring longer / earlier spans.
            spans = filter_spans(spans)
            span_dicts = []
            for ent in spans:
                span_dicts.append(
                    {
                        "token_start": ent.start,
                        "token_end": ent.end - 1,
                        "start": ent.start_char,
                        "end": ent.end_char,
                        "text": ent.text,
                        "label": ent.label_,
                        "source": spacy_model,
                        "input_hash": eg[INPUT_HASH_ATTR],
                    }
                )
            task["spans"] = span_dicts
            task[BINARY_ATTR] = False
            task = set_hashes(task)
            yield task
    else:
        yield from stream
|
252 |
+
|
253 |
+
|
254 |
+
def get_ner_labels(
    nlp: Language, *, label: Optional[List[str]], component: str = "ner"
) -> Tuple[List[str], bool]:
    """Resolve the label set to annotate for a pipeline component.

    Returns a tuple ``(labels, no_missing)`` where ``no_missing`` is True
    when the resolved labels cover every label the model can predict (i.e.
    we're annotating the full label set, not a subset).
    """
    model_labels = nlp.pipe_labels.get(component, [])
    labels = get_pipe_labels(label, model_labels)
    # Check if we're annotating all labels present in the model or a subset
    covered = set(labels).intersection(set(model_labels))
    no_missing = len(covered) == len(model_labels)
    return labels, no_missing
|
262 |
+
|
263 |
+
|
264 |
+
def get_update(nlp: Language, *, no_missing: bool) -> Callable[[List[TaskType]], None]:
    """Build an ``update`` callback that fine-tunes *nlp* on accepted answers.

    Args:
        nlp: The spaCy pipeline to update in place.
        no_missing: When True, tokens outside the annotated spans are treated
            as explicitly "outside" any entity; otherwise they are "missing"
            (unknown), so the model isn't penalized for entities the
            annotator didn't mark.

    Returns:
        A callback taking a batch of answered tasks, suitable for a Prodigy
        recipe's "update" component.
    """

    def update(answers: List[TaskType]) -> None:
        log(f"RECIPE: Updating model with {len(answers)} answers")
        examples = []
        for eg in answers:
            if eg["answer"] == "accept":
                # Predicted doc and reference doc built from the same text.
                doc = make_raw_doc(nlp, eg)
                ref = make_raw_doc(nlp, eg)
                # BUG FIX: char_span returns None for offsets that don't
                # align with token boundaries; previously those Nones were
                # passed straight into set_ents, which raises. Filter them.
                spans = []
                for span in eg.get("spans", []):
                    char_span = doc.char_span(span["start"], span["end"], label=span["label"])
                    if char_span is not None:
                        spans.append(char_span)
                # NOTE(review): spans are created on `doc` but applied to
                # `ref` — confirm spaCy accepts spans from an identical
                # sibling Doc here.
                value = SetEntsDefault.outside if no_missing else SetEntsDefault.missing
                ref.set_ents(spans, default=value)
                examples.append(Example(doc, ref))
        nlp.update(examples)

    return update
|