etweedy committed on
Commit d034971 · 1 Parent(s): 013eb2e

Upload 8 files

lib/.DS_Store ADDED
Binary file (6.15 kB)
 
lib/.ipynb_checkpoints/utils-checkpoint.py ADDED
(Jupyter checkpoint copy; contents identical to lib/utils.py below)
lib/__init__.py ADDED
File without changes
lib/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (166 Bytes)
 
lib/__pycache__/utils.cpython-310.pyc ADDED
Binary file (6.13 kB)
 
lib/utils.py ADDED
@@ -0,0 +1,188 @@
+ import numpy as np
+ from scipy.special import softmax
+ import collections
+ import torch
+ from torch.utils.data import DataLoader
+ from transformers import default_data_collator
+
+ def preprocess_examples(examples, tokenizer, max_length=384, stride=128):
+     """
+     Preprocesses and tokenizes examples in preparation for inference
+
+     Parameters:
+     -----------
+     examples : datasets.Dataset
+         The dataset of examples. Must have columns:
+         'id', 'question', 'context'
+     tokenizer : transformers.AutoTokenizer
+         The tokenizer for the model
+     max_length : int
+         The max length for context truncation
+     stride : int
+         The stride for context truncation
+
+     Returns:
+     --------
+     inputs : dict
+         The tokenized and processed data dictionary with
+         keys 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
+         All values are lists of length = # of features produced by the tokenizer
+         inputs['input_ids'][k] : list
+             token ids corresponding to tokens in feature k
+         inputs['attention_mask'][k] : list
+             attention mask for feature k
+         inputs['offset_mapping'][k] : list
+             character offsets (start, end) for each token in feature k,
+             or None for tokens outside the context
+         inputs['example_id'][k] : int
+             id of example from which feature k originated
+     """
+     questions = [q.strip() for q in examples["question"]]
+     inputs = tokenizer(
+         questions,
+         examples['context'],
+         max_length=max_length,
+         truncation="only_second",
+         stride=stride,
+         return_overflowing_tokens=True,
+         return_offsets_mapping=True,
+         padding="max_length",
+     )
+
+     # Map each overflow feature back to the example it came from
+     sample_map = inputs.pop("overflow_to_sample_mapping")
+     example_ids = []
+
+     for i in range(len(inputs["input_ids"])):
+         sample_idx = sample_map[i]
+         example_ids.append(examples["id"][sample_idx])
+
+         # Null out offsets for tokens that are not part of the context
+         sequence_ids = inputs.sequence_ids(i)
+         offset = inputs["offset_mapping"][i]
+         inputs["offset_mapping"][i] = [
+             o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
+         ]
+
+     inputs["example_id"] = example_ids
+     return inputs
+
+
+ def make_predictions(model, tokenizer, inputs, examples,
+                      n_best=20, max_answer_length=30):
+     """
+     Generates a list of prediction data based on logits
+
+     Parameters:
+     -----------
+     model : transformers.AutoModelForQuestionAnswering
+         The trained model
+     tokenizer : transformers.AutoTokenizer
+         The model's tokenizer
+     inputs : datasets.Dataset
+         The tokenized and processed dataset, as produced by mapping
+         preprocess_examples over examples. Must have columns:
+         'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
+     examples : datasets.Dataset
+         The dataset of examples. Must have columns:
+         'id', 'question', 'context'
+     n_best : int
+         The number of top start/end (by logit) indices to consider
+     max_answer_length : int
+         The maximum length (in tokens) allowed for a candidate answer
+
+     Returns:
+     --------
+     predicted_answers : list(dict)
+         predicted_answers[k] has keys 'id', 'prediction_text', 'confidence'
+         predicted_answers[k]['id'] : int
+             The unique id of the example
+         predicted_answers[k]['prediction_text'] : str
+             The predicted answer as a string
+         predicted_answers[k]['confidence'] : float
+             The predicted probability corresponding to the answer, i.e. the
+             corresponding output of a softmax function on logits
+     """
+     assert n_best <= len(inputs['input_ids'][0]), 'n_best cannot be larger than max_length'
+
+     # Choose the best available device
+     if torch.backends.mps.is_available():
+         device = "mps"
+     elif torch.cuda.is_available():
+         device = "cuda"
+     else:
+         device = "cpu"
+
+     # Strip the bookkeeping columns before feeding the data to the model
+     data_for_model = inputs.remove_columns(["example_id", "offset_mapping"])
+     data_for_model.set_format("torch", device=device)
+     dl = DataLoader(
+         data_for_model,
+         collate_fn=default_data_collator,
+         batch_size=len(inputs),
+     )
+     model = model.to(device)
+     # A single batch holds all features; run inference without gradients
+     with torch.no_grad():
+         for batch in dl:
+             outputs = model(**batch)
+
+     start_logits = outputs.start_logits.cpu().numpy()
+     end_logits = outputs.end_logits.cpu().numpy()
+
+     # Group feature indices by the example they originated from
+     example_to_inputs = collections.defaultdict(list)
+     for idx, feature in enumerate(inputs):
+         example_to_inputs[feature["example_id"]].append(idx)
+
+     predicted_answers = []
+     for example in examples:
+         example_id = example["id"]
+         context = example["context"]
+         answers = []
+
+         for feature_index in example_to_inputs[example_id]:
+             start_logit = start_logits[feature_index]
+             end_logit = end_logits[feature_index]
+             offsets = inputs[feature_index]['offset_mapping']
+
+             # Indices of the n_best highest start/end logits
+             start_indices = np.argsort(start_logit)[-1:-n_best-1:-1].tolist()
+             end_indices = np.argsort(end_logit)[-1:-n_best-1:-1].tolist()
+
+             for start_index in start_indices:
+                 for end_index in end_indices:
+                     # Skip answers with a length that is either < 0 or > max_answer_length
+                     if (
+                         end_index < start_index
+                         or end_index - start_index + 1 > max_answer_length
+                     ):
+                         continue
+
+                     # Skip pairs where exactly one endpoint falls outside the context
+                     if (offsets[start_index] is None) != (offsets[end_index] is None):
+                         continue
+                     # Both endpoints outside the context: the "no answer" candidate
+                     if (offsets[start_index] is None) and (offsets[end_index] is None):
+                         answers.append(
+                             {
+                                 "text": '',
+                                 "logit_score": start_logit[start_index] + end_logit[end_index],
+                             }
+                         )
+                     else:
+                         answers.append(
+                             {
+                                 "text": context[offsets[start_index][0] : offsets[end_index][1]],
+                                 "logit_score": start_logit[start_index] + end_logit[end_index],
+                             }
+                         )
+         answer_logits = [a['logit_score'] for a in answers]
+         answer_probs = softmax(answer_logits)
+
+         if len(answers) > 0:
+             # Report the candidate with the highest logit score, together with
+             # its softmax probability over all candidates for this example
+             best_index = int(np.argmax(answer_logits))
+             predicted_answers.append(
+                 {
+                     'id': example_id,
+                     'prediction_text': answers[best_index]['text'],
+                     'confidence': float(answer_probs[best_index]),
+                 }
+             )
+         else:
+             predicted_answers.append({'id': example_id, 'prediction_text': '', 'confidence': 0.0})
+     for pred in predicted_answers:
+         if pred['prediction_text'] == '':
+             pred['prediction_text'] = "I don't have an answer based on the context provided."
+     return predicted_answers
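For reference, a minimal usage sketch of the two helpers above, assuming the 'etweedy/roberta-base-squad-v2' checkpoint that roberta_app.py below loads; the sample context and question are placeholder values:

    from datasets import Dataset
    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
    from lib.utils import preprocess_examples, make_predictions

    repo_id = 'etweedy/roberta-base-squad-v2'
    model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # A one-example dataset; 'id', 'question', 'context' are the required columns
    data_raw = Dataset.from_dict({
        'id': [0],
        'context': ['The Eiffel Tower is located in Paris, France.'],  # placeholder
        'question': ['Where is the Eiffel Tower?'],                    # placeholder
    })
    # Tokenize and split into model-ready features
    data_proc = data_raw.map(
        preprocess_examples,
        remove_columns=data_raw.column_names,
        batched=True,
        fn_kwargs={'tokenizer': tokenizer},
    )
    preds = make_predictions(model, tokenizer, data_proc, data_raw, n_best=20)
    print(preds[0]['prediction_text'], preds[0]['confidence'])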
requirements.txt ADDED
@@ -0,0 +1,71 @@
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ altair==5.0.1
+ async-timeout==4.0.2
+ attrs==23.1.0
+ blinker==1.6.2
+ cachetools==5.3.1
+ certifi==2023.5.7
+ charset-normalizer==3.1.0
+ click==8.1.4
+ datasets==2.13.1
+ decorator==5.1.1
+ dill==0.3.6
+ filelock==3.12.2
+ frozenlist==1.3.3
+ fsspec==2023.6.0
+ gitdb==4.0.10
+ GitPython==3.1.31
+ huggingface-hub==0.16.2
+ idna==3.4
+ importlib-metadata==6.7.0
+ Jinja2==3.1.2
+ jsonschema==4.18.0
+ jsonschema-specifications==2023.6.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.0.4
+ multiprocess==0.70.14
+ networkx==3.1
+ numpy==1.25.0
+ packaging==23.1
+ pandas==2.0.3
+ Pillow==9.5.0
+ protobuf==4.23.3
+ pyarrow==12.0.1
+ pydeck==0.8.1b0
+ Pygments==2.15.1
+ Pympler==1.0.1
+ python-dateutil==2.8.2
+ pytz==2023.3
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
+ referencing==0.29.1
+ regex==2023.6.3
+ requests==2.31.0
+ rich==13.4.2
+ rpds-py==0.8.8
+ safetensors==0.3.1
+ scipy==1.11.1
+ six==1.16.0
+ smmap==5.0.0
+ streamlit==1.24.0
+ sympy==1.12
+ tenacity==8.2.2
+ tokenizers==0.13.3
+ toml==0.10.2
+ toolz==0.12.0
+ torch==2.0.1
+ tornado==6.3.2
+ tqdm==4.65.0
+ transformers==4.30.2
+ typing_extensions==4.7.1
+ tzdata==2023.3
+ tzlocal==4.3.1
+ urllib3==2.0.3
+ validators==0.20.0
+ xxhash==3.2.0
+ yarl==1.9.2
+ zipp==3.15.0
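The pins above reproduce the app's environment; they can be installed with pip in the usual way:

    pip install -r requirements.txt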
roberta_app.py ADDED
@@ -0,0 +1,97 @@
+ import streamlit as st
+ from datasets import Dataset
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForQuestionAnswering,
+ )
+ from lib.utils import preprocess_examples, make_predictions
+
+ # TO DO:
+ # - make it pretty
+ # - add support for multiple questions corresponding to same context
+ # - add examples
+ # What else??
+
+ # Persist the response and inputs across Streamlit reruns
+ if 'response' not in st.session_state:
+     st.session_state['response'] = ''
+ if 'context' not in st.session_state:
+     st.session_state['context'] = ''
+ if 'question' not in st.session_state:
+     st.session_state['question'] = ''
+
+ # Load model and tokenizer from Hugging Face repo (cached across reruns)
+ @st.cache_resource(show_spinner=False)
+ def get_model():
+     repo_id = 'etweedy/roberta-base-squad-v2'
+     model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
+     tokenizer = AutoTokenizer.from_pretrained(repo_id)
+     return model, tokenizer
+
+ with st.spinner('Loading the model...'):
+     model, tokenizer = get_model()
+
+ input_container = st.container()
+ st.divider()
+ response_container = st.container()
+
+ # Form for user inputs
+ with input_container:
+     with st.form(key='input_form', clear_on_submit=False):
+         context = st.text_area(
+             label='Context',
+             value='',
+             key='context_field',
+             label_visibility='hidden',
+             placeholder='Enter your context paragraph here.',
+             height=300,
+         )
+         question = st.text_input(
+             label='Question',
+             value='',
+             key='question_field',
+             label_visibility='hidden',
+             placeholder='Enter your question here.',
+         )
+         query_submitted = st.form_submit_button("Submit")
+         if query_submitted:
+             with st.spinner('Generating response...'):
+                 # Wrap the single question/context pair in a one-row Dataset
+                 data_raw = Dataset.from_dict(
+                     {
+                         'id': [0],
+                         'context': [context],
+                         'question': [question],
+                     }
+                 )
+                 # Tokenize and split into model-ready features
+                 data_proc = data_raw.map(
+                     preprocess_examples,
+                     remove_columns=data_raw.column_names,
+                     batched=True,
+                     fn_kwargs={
+                         'tokenizer': tokenizer,
+                     },
+                 )
+                 predicted_answers = make_predictions(model, tokenizer,
+                                                      data_proc, data_raw,
+                                                      n_best=20)
+                 answer = predicted_answers[0]['prediction_text']
+                 confidence = predicted_answers[0]['confidence']
+                 st.session_state['response'] = f"""
+                 Answer: {answer}\n
+                 Confidence: {confidence:.2%}
+                 """
+ with response_container:
+     st.write(st.session_state['response'])
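Assuming the requirements above are installed, the demo launches through Streamlit's standard entry point:

    streamlit run roberta_app.py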