etweedy committed on
Commit d034971 · 1 Parent(s): 013eb2e

Upload 8 files

lib/.DS_Store ADDED
Binary file (6.15 kB)
 
lib/.ipynb_checkpoints/utils-checkpoint.py ADDED
(Jupyter checkpoint copy; contents identical to lib/utils.py below)
lib/__init__.py ADDED
File without changes
lib/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (166 Bytes)
 
lib/__pycache__/utils.cpython-310.pyc ADDED
Binary file (6.13 kB)
 
lib/utils.py ADDED
@@ -0,0 +1,188 @@
+ import numpy as np
+ from scipy.special import softmax
+ import collections
+ import torch
+ from torch.utils.data import DataLoader
+ from transformers import default_data_collator
+
+ def preprocess_examples(examples, tokenizer, max_length=384, stride=128):
+     """
+     Preprocesses and tokenizes examples in preparation for inference
+
+     Parameters:
+     -----------
+     examples : datasets.Dataset
+         The dataset of examples. Must have columns:
+         'id', 'question', 'context'
+     tokenizer : transformers.AutoTokenizer
+         The tokenizer for the model
+     max_length : int
+         The max length for context truncation
+     stride : int
+         The stride for context truncation
+
+     Returns:
+     --------
+     inputs : dict
+         The tokenized and processed data dictionary with
+         keys 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
+         All values are lists of length = # of features produced by the tokenizer
+         inputs['input_ids'][k] : list
+             token ids corresponding to tokens in feature k
+         inputs['attention_mask'][k] : list
+             attention mask for feature k
+         inputs['offset_mapping'][k] : list
+             character offsets (start, end) for each token in feature k,
+             or None for tokens outside the context
+         inputs['example_id'][k] : int
+             id of example from which feature k originated
+     """
+     questions = [q.strip() for q in examples["question"]]
+     inputs = tokenizer(
+         questions,
+         examples['context'],
+         max_length=max_length,
+         truncation="only_second",
+         stride=stride,
+         return_overflowing_tokens=True,
+         return_offsets_mapping=True,
+         padding="max_length",
+     )
+
+     # Map each overflow feature back to the example it came from
+     sample_map = inputs.pop("overflow_to_sample_mapping")
+     example_ids = []
+
+     for i in range(len(inputs["input_ids"])):
+         sample_idx = sample_map[i]
+         example_ids.append(examples["id"][sample_idx])
+
+         # Null out offsets for tokens that are not part of the context
+         sequence_ids = inputs.sequence_ids(i)
+         offset = inputs["offset_mapping"][i]
+         inputs["offset_mapping"][i] = [
+             o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
+         ]
+
+     inputs["example_id"] = example_ids
+     return inputs
+
+
+ def make_predictions(model, tokenizer, inputs, examples,
+                      n_best=20, max_answer_length=30):
+     """
+     Generates a list of prediction data based on logits
+
+     Parameters:
+     -----------
+     model : transformers.AutoModelForQuestionAnswering
+         The trained model
+     tokenizer : transformers.AutoTokenizer
+         The model's tokenizer
+     inputs : datasets.Dataset
+         The tokenized and processed dataset, as produced by mapping
+         preprocess_examples over examples. Must have columns:
+         'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
+     examples : datasets.Dataset
+         The dataset of examples. Must have columns:
+         'id', 'question', 'context'
+     n_best : int
+         The number of top start/end (by logit) indices to consider
+     max_answer_length : int
+         The maximum length (in tokens) allowed for a candidate answer
+
+     Returns:
+     --------
+     predicted_answers : list(dict)
+         predicted_answers[k] has keys 'id', 'prediction_text', 'confidence'
+         predicted_answers[k]['id'] : int
+             The unique id of the example
+         predicted_answers[k]['prediction_text'] : str
+             The predicted answer as a string
+         predicted_answers[k]['confidence'] : float
+             The predicted probability corresponding to the answer, i.e. the
+             corresponding output of a softmax function on logits
+     """
+     assert n_best <= len(inputs['input_ids'][0]), 'n_best cannot be larger than max_length'
+
+     # Choose the best available device
+     if torch.backends.mps.is_available():
+         device = "mps"
+     elif torch.cuda.is_available():
+         device = "cuda"
+     else:
+         device = "cpu"
+
+     # Strip the bookkeeping columns before feeding the data to the model
+     data_for_model = inputs.remove_columns(["example_id", "offset_mapping"])
+     data_for_model.set_format("torch", device=device)
+     dl = DataLoader(
+         data_for_model,
+         collate_fn=default_data_collator,
+         batch_size=len(inputs),
+     )
+     model = model.to(device)
+     # A single batch holds all features; run inference without gradients
+     with torch.no_grad():
+         for batch in dl:
+             outputs = model(**batch)
+
+     start_logits = outputs.start_logits.cpu().numpy()
+     end_logits = outputs.end_logits.cpu().numpy()
+
+     # Group feature indices by the example they originated from
+     example_to_inputs = collections.defaultdict(list)
+     for idx, feature in enumerate(inputs):
+         example_to_inputs[feature["example_id"]].append(idx)
+
+     predicted_answers = []
+     for example in examples:
+         example_id = example["id"]
+         context = example["context"]
+         answers = []
+
+         for feature_index in example_to_inputs[example_id]:
+             start_logit = start_logits[feature_index]
+             end_logit = end_logits[feature_index]
+             offsets = inputs[feature_index]['offset_mapping']
+
+             # Indices of the n_best highest start/end logits
+             start_indices = np.argsort(start_logit)[-1:-n_best-1:-1].tolist()
+             end_indices = np.argsort(end_logit)[-1:-n_best-1:-1].tolist()
+
+             for start_index in start_indices:
+                 for end_index in end_indices:
+                     # Skip answers with a length that is either < 0 or > max_answer_length
+                     if (
+                         end_index < start_index
+                         or end_index - start_index + 1 > max_answer_length
+                     ):
+                         continue
+
+                     # Skip pairs where exactly one endpoint falls outside the context
+                     if (offsets[start_index] is None) != (offsets[end_index] is None):
+                         continue
+                     # Both endpoints outside the context: the "no answer" candidate
+                     if (offsets[start_index] is None) and (offsets[end_index] is None):
+                         answers.append(
+                             {
+                                 "text": '',
+                                 "logit_score": start_logit[start_index] + end_logit[end_index],
+                             }
+                         )
+                     else:
+                         answers.append(
+                             {
+                                 "text": context[offsets[start_index][0] : offsets[end_index][1]],
+                                 "logit_score": start_logit[start_index] + end_logit[end_index],
+                             }
+                         )
+         answer_logits = [a['logit_score'] for a in answers]
+         answer_probs = softmax(answer_logits)
+
+         if len(answers) > 0:
+             # Report the candidate with the highest logit score, together with
+             # its softmax probability over all candidates for this example
+             best_index = int(np.argmax(answer_logits))
+             predicted_answers.append(
+                 {
+                     'id': example_id,
+                     'prediction_text': answers[best_index]['text'],
+                     'confidence': float(answer_probs[best_index]),
+                 }
+             )
+         else:
+             predicted_answers.append({'id': example_id, 'prediction_text': '', 'confidence': 0.0})
+     for pred in predicted_answers:
+         if pred['prediction_text'] == '':
+             pred['prediction_text'] = "I don't have an answer based on the context provided."
+     return predicted_answers
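For reference, a minimal usage sketch of the two helpers above, assuming the 'etweedy/roberta-base-squad-v2' checkpoint that roberta_app.py below loads; the sample context and question are placeholder values:

    from datasets import Dataset
    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
    from lib.utils import preprocess_examples, make_predictions

    repo_id = 'etweedy/roberta-base-squad-v2'
    model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # A one-example dataset; 'id', 'question', 'context' are the required columns
    data_raw = Dataset.from_dict({
        'id': [0],
        'context': ['The Eiffel Tower is located in Paris, France.'],  # placeholder
        'question': ['Where is the Eiffel Tower?'],                    # placeholder
    })
    # Tokenize and split into model-ready features
    data_proc = data_raw.map(
        preprocess_examples,
        remove_columns=data_raw.column_names,
        batched=True,
        fn_kwargs={'tokenizer': tokenizer},
    )
    preds = make_predictions(model, tokenizer, data_proc, data_raw, n_best=20)
    print(preds[0]['prediction_text'], preds[0]['confidence'])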
requirements.txt ADDED
@@ -0,0 +1,71 @@
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ altair==5.0.1
+ async-timeout==4.0.2
+ attrs==23.1.0
+ blinker==1.6.2
+ cachetools==5.3.1
+ certifi==2023.5.7
+ charset-normalizer==3.1.0
+ click==8.1.4
+ datasets==2.13.1
+ decorator==5.1.1
+ dill==0.3.6
+ filelock==3.12.2
+ frozenlist==1.3.3
+ fsspec==2023.6.0
+ gitdb==4.0.10
+ GitPython==3.1.31
+ huggingface-hub==0.16.2
+ idna==3.4
+ importlib-metadata==6.7.0
+ Jinja2==3.1.2
+ jsonschema==4.18.0
+ jsonschema-specifications==2023.6.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.0.4
+ multiprocess==0.70.14
+ networkx==3.1
+ numpy==1.25.0
+ packaging==23.1
+ pandas==2.0.3
+ Pillow==9.5.0
+ protobuf==4.23.3
+ pyarrow==12.0.1
+ pydeck==0.8.1b0
+ Pygments==2.15.1
+ Pympler==1.0.1
+ python-dateutil==2.8.2
+ pytz==2023.3
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
+ referencing==0.29.1
+ regex==2023.6.3
+ requests==2.31.0
+ rich==13.4.2
+ rpds-py==0.8.8
+ safetensors==0.3.1
+ scipy==1.11.1
+ six==1.16.0
+ smmap==5.0.0
+ streamlit==1.24.0
+ sympy==1.12
+ tenacity==8.2.2
+ tokenizers==0.13.3
+ toml==0.10.2
+ toolz==0.12.0
+ torch==2.0.1
+ tornado==6.3.2
+ tqdm==4.65.0
+ transformers==4.30.2
+ typing_extensions==4.7.1
+ tzdata==2023.3
+ tzlocal==4.3.1
+ urllib3==2.0.3
+ validators==0.20.0
+ xxhash==3.2.0
+ yarl==1.9.2
+ zipp==3.15.0
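The pins above reproduce the app's environment; they can be installed with pip in the usual way:

    pip install -r requirements.txt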
roberta_app.py ADDED
@@ -0,0 +1,97 @@
+ import streamlit as st
+ from datasets import Dataset
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForQuestionAnswering,
+ )
+ from lib.utils import preprocess_examples, make_predictions
+
+ # TO DO:
+ # - make it pretty
+ # - add support for multiple questions corresponding to same context
+ # - add examples
+ # What else??
+
+ # Persist the response and inputs across Streamlit reruns
+ if 'response' not in st.session_state:
+     st.session_state['response'] = ''
+ if 'context' not in st.session_state:
+     st.session_state['context'] = ''
+ if 'question' not in st.session_state:
+     st.session_state['question'] = ''
+
+ # Load model and tokenizer from Hugging Face repo (cached across reruns)
+ @st.cache_resource(show_spinner=False)
+ def get_model():
+     repo_id = 'etweedy/roberta-base-squad-v2'
+     model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
+     tokenizer = AutoTokenizer.from_pretrained(repo_id)
+     return model, tokenizer
+
+ with st.spinner('Loading the model...'):
+     model, tokenizer = get_model()
+
+ input_container = st.container()
+ st.divider()
+ response_container = st.container()
+
+ # Form for user inputs
+ with input_container:
+     with st.form(key='input_form', clear_on_submit=False):
+         context = st.text_area(
+             label='Context',
+             value='',
+             key='context_field',
+             label_visibility='hidden',
+             placeholder='Enter your context paragraph here.',
+             height=300,
+         )
+         question = st.text_input(
+             label='Question',
+             value='',
+             key='question_field',
+             label_visibility='hidden',
+             placeholder='Enter your question here.',
+         )
+         query_submitted = st.form_submit_button("Submit")
+         if query_submitted:
+             with st.spinner('Generating response...'):
+                 # Wrap the single question/context pair in a one-row Dataset
+                 data_raw = Dataset.from_dict(
+                     {
+                         'id': [0],
+                         'context': [context],
+                         'question': [question],
+                     }
+                 )
+                 # Tokenize and split into model-ready features
+                 data_proc = data_raw.map(
+                     preprocess_examples,
+                     remove_columns=data_raw.column_names,
+                     batched=True,
+                     fn_kwargs={
+                         'tokenizer': tokenizer,
+                     },
+                 )
+                 predicted_answers = make_predictions(model, tokenizer,
+                                                      data_proc, data_raw,
+                                                      n_best=20)
+                 answer = predicted_answers[0]['prediction_text']
+                 confidence = predicted_answers[0]['confidence']
+                 st.session_state['response'] = f"""
+                 Answer: {answer}\n
+                 Confidence: {confidence:.2%}
+                 """
+ with response_container:
+     st.write(st.session_state['response'])
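Assuming the requirements above are installed, the demo launches through Streamlit's standard entry point:

    streamlit run roberta_app.py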