|
import os
|
|
from datasets import Sequence, Value, Features
|
|
from datasets import Dataset, DatasetDict
|
|
|
|
# Feature schema for one untokenized example record.
#
# Scalar string columns: guid, question, context, title, classtype, source and
# dataset. "answers" is a sequence of {text, answer_start} entries and
# "is_impossible" is a boolean flag.
# NOTE(review): the layout looks SQuAD-like; confirm against the data loader.
EXAMPLE_FEATURES = Features(
    {
        "guid": Value(dtype="string", id=None),
        "question": Value(dtype="string", id=None),
        "context": Value(dtype="string", id=None),
        "answers": Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None),
            },
        ),
        "is_impossible": Value(dtype="bool", id=None),
        "title": Value(dtype="string", id=None),
        "classtype": Value(dtype="string", id=None),
        "source": Value(dtype="string", id=None),
        "dataset": Value(dtype="string", id=None),
    }
)
|
|
|
|
# Feature schema for the "sketch" training set: three integer sequences
# (input_ids as int32, attention_mask and token_type_ids as int8) plus a
# single int64 "labels" value per row.
# NOTE(review): presumably tokenizer output for a classification head; confirm.
SKETCH_TRAIN_FEATURES = Features(
    {
        "input_ids": Sequence(feature=Value(dtype="int32", id=None)),
        "attention_mask": Sequence(feature=Value(dtype="int8", id=None)),
        "token_type_ids": Sequence(feature=Value(dtype="int8", id=None)),
        "labels": Value(dtype="int64", id=None),
    }
)
|
|
|
|
# Feature schema for the "sketch" evaluation set: the same three integer
# sequences and int64 "labels" value as the training schema, plus a string
# "example_id" column.
# NOTE(review): "example_id" presumably links a row back to its source
# example; confirm against the post-processing code.
SKETCH_EVAL_FEATURES = Features(
    {
        "input_ids": Sequence(feature=Value(dtype="int32", id=None)),
        "attention_mask": Sequence(feature=Value(dtype="int8", id=None)),
        "token_type_ids": Sequence(feature=Value(dtype="int8", id=None)),
        "labels": Value(dtype="int64", id=None),
        "example_id": Value(dtype="string", id=None),
    }
)
|
|
|
|
# Feature schema for the "intensive" training set: three integer sequences
# (input_ids as int32, attention_mask and token_type_ids as int8), int64
# "start_positions" / "end_positions" values and a float64 "is_impossibles"
# value per row.
# NOTE(review): presumably span-extraction targets plus an answerability
# target stored as a float; confirm against the model's loss code.
#
# The misspelled name ("FEATUERS") is kept because existing callers may import
# it; a correctly spelled alias is provided right below.
INTENSIVE_TRAIN_FEATUERS = Features(
    {
        "input_ids": Sequence(feature=Value(dtype="int32", id=None)),
        "attention_mask": Sequence(feature=Value(dtype="int8", id=None)),
        "token_type_ids": Sequence(feature=Value(dtype="int8", id=None)),
        "start_positions": Value(dtype="int64", id=None),
        "end_positions": Value(dtype="int64", id=None),
        "is_impossibles": Value(dtype="float64", id=None),
    }
)

# Correctly spelled alias; refers to the very same Features object.
INTENSIVE_TRAIN_FEATURES = INTENSIVE_TRAIN_FEATUERS
|
|
|
|
# Feature schema for the "intensive" evaluation set: three integer sequences
# (input_ids as int32, attention_mask and token_type_ids as int8), an
# "offset_mapping" column holding a sequence of int64 sequences, and a string
# "example_id" column. No position or label columns are declared here.
# NOTE(review): "offset_mapping" presumably holds per-token character spans;
# confirm against the tokenization code.
#
# The misspelled name ("FEATUERS") is kept because existing callers may import
# it; a correctly spelled alias is provided right below.
INTENSIVE_EVAL_FEATUERS = Features(
    {
        "input_ids": Sequence(feature=Value(dtype="int32", id=None)),
        "attention_mask": Sequence(feature=Value(dtype="int8", id=None)),
        "token_type_ids": Sequence(feature=Value(dtype="int8", id=None)),
        "offset_mapping": Sequence(
            feature=Sequence(
                feature=Value(dtype="int64", id=None)
            )
        ),
        "example_id": Value(dtype="string", id=None),
    }
)

# Correctly spelled alias; refers to the very same Features object.
INTENSIVE_EVAL_FEATURES = INTENSIVE_EVAL_FEATUERS
|
|
|
|
# Column names used to address fields of an example record.
QUESTION_COLUMN_NAME = "question"
CONTEXT_COLUMN_NAME = "context"
ANSWER_COLUMN_NAME = "answers"
ANSWERABLE_COLUMN_NAME = "is_impossible"
ID_COLUMN_NAME = "guid"
|
|
|
|
# File names (JSON) used for prediction / scoring artifacts.
# NOTE(review): which component reads or writes each file is not visible in
# this module; confirm against the callers.
SCORE_EXT_FILE_NAME = "cls_score.json"
INTENSIVE_PRED_FILE_NAME = "predictions.json"
NBEST_PRED_FILE_NAME = "nbest_predictions.json"
SCORE_DIFF_FILE_NAME = "null_odds.json"
|
|
|
|
# Absolute path of the default YAML configuration file.
#
# ``os.path.realpath(__file__)`` is the path of this module *file*; joining a
# sub-path directly onto it would produce ``.../<module>.py/args/...``, which
# can never exist. The containing directory is taken first so the result is
# ``<module dir>/args/default_config.yaml``.
# NOTE(review): assumes the ``args`` directory sits next to this module;
# confirm against the repository layout.
DEFAULT_CONFIG_FILE = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    "args",
    "default_config.yaml",
)
|
|
|
|
# Prompt texts asking the user for a question / a context, in Korean and in
# English. The Korean literals were stored mis-decoded and split across lines
# (leaving unterminated strings); they are restored to proper Hangul here.
KO_QUERY_HELP_TEXT = "질문을 입력해주세요!"  # "Please enter a question!"
KO_CONTEXT_HELP_TEXT = "문맥을 입력해주세요!"  # "Please enter a context!"

EN_QUERY_HELP_TEXT = "Plz enter your question!"
EN_CONTEXT_HELP_TEXT = "Plz enter your context!"
|
|
|
|
# Korean demo question ("In which era was Yi Sun-sin a military officer?").
# The literal was stored mis-decoded; it is restored to Hangul here.
# NOTE(review): reconstructed from the surviving bytes of the garbled text;
# confirm against the original wording.
KO_EXAMPLE_QUERY = "이순신은 어느 시대의 무신이야?"
|
|
# Korean demo passage; the literal has surrounding whitespace removed via
# ``.strip()``.
# NOTE(review): the text below looks mis-decoded (UTF-8 Korean apparently read
# through a Thai single-byte code page, with stray line breaks where bytes were
# lost) -- confirm against the original passage and restore it.
# NOTE(review): the lone `|` lines sit inside the literal and are not removed
# by ``.strip()``, so they are part of the value -- confirm whether intended.
KO_EXAMPLE_CONTEXTS = """
|
|
16์ธ๊ธฐ ์กฐ์ ์ ๋ฌด์ ์ผ๋ก, ์ผ๋ณธ์ด ์กฐ์ ์ ์นจ๊ณตํ์ฌ ์ผ์ด๋ ์ ์์ธ ์์ง์๋ ๋น์ ์กฐ์ ์๊ตฐ์ ํต์ํ๋ ์ ๋
์ด์ ๊ตฌ๊ตญ์์
์ด๋ค.
|
|
|
|
์นจ๋ต๊ตฐ๊ณผ ๊ต์ ํ์ฌ ์ฒ์ฌ์ ์ธ ํ์ฝ์์ ํผ์น๊ณ ์ค์ ์ง์ ์์ด ์๊ธ์์กฑ์ ํด๋ธ ๊ตฐ ์งํ๊ด์ด์, ํํ ์ธ์ฌ๋ค์๊ฒ ๋ฒ์ ๋ฐ๋ฅธ ์์น์ ์๊ตฌํ๋ฉด์๋ ๋๋ ทํ ์ฑ๊ณต๋ฅ ๊ณผ ๋ถ์กฑํจ ์๋ ์ฒ์ฐ๋ฅผ ๋ณด์ฅํ ์๊ด, ์ง๋ฐฉ๊ด ์์ ๋ฐฑ์ฑ๋ค์๊ฒ ์ ์ ์ ๋ฒ ํ๊ณ ์ ์์๋ ๊ทธ๋ค์ ์๋ฌดํ๊ณ ๊ตฌ์ ํ ๋ชฉ๋ฏผ๊ด, ๊ณ ์ ๊ด๋ฃ์ ์ ์ ๋ฐ ์ถ์ฌ๋ฅผ ๊ฑฐ๋ถํ๊ณ ๊ณต์ ๊ณผ ๊ตญ์ต, ์ ์ ๋ฅผ ์ค์ํ ์ธ๊ฒฉ์, ์์ ์ด ๊ดํ ํ ์ง์ญ์ ๋ฐฑ์ฑ๊ณผ ๋ณ์ฌ์๊ฒ ๊ฐ์ข
์ฌ์
์ ์ฅ๋ คํ์ฌ ๋ง์ ์ํจ๋ฅผ ์ป์ด๋ธ ํ์ ๊ฐ, ๊ทธ๋ฆฌ๊ณ ์์ ์์ํ ์กฐ์ ์ ํ๋ฐ์ผ๋ก ์ฌํ์๊ฐ ๋๊ฑฐ๋ ํ์์์ ์ค์ฑ
์ผ๋ก ๊ตฐ์ฌยท๊ตฐ์ ๋ค์ ๊ฑฐ์ ์์คํ๊ฑฐ๋ ์ด๋จธ๋์ ์๋ค์ ์๋ ๋ฑ ๋ง์ ์๋์ ๊ฒช๊ณ ๋ ๋ช
๋ ํด์ ๋ฑ์ ์ํ๋ฉฐ ๊ตดํ์ง ์์ ์ฒ ์ธ์ ๋ฉด๋ชจ๊น์ง ๊ฐ์ถฐ ์กฐ์ ์ค๊ธฐ์ ๋ช
์ฅ์ ๋์ด ํ๊ตญ์ฌ ์ต๊ณ ์์ธ์ ๋ฐ์ด๊น์ง ์ค๋ฅธ ์ธ๋ฌผ์ด๋ค.
|
|
|
|
์์ ๋ถํฐ ๊ทธ๋ฅผ ์ฌ์ ์ผ๋ก ์๊ณ ์๋ ์ธ๊ทผ ๋ฐฑ์ฑ์ด๋ ๊ตฐ์กธ, ์ผ๋ถ ์ฅ์์ ์ฌ์๋ค๋ก๋ถํฐ ๋ฐ์ด๋ ์ธ๋ฌผ๋ก ํ๊ฐ๋ฐ์๊ณ ๊ทธ๋ ์ง ์๋๋ผ๋ ๋ช
์ฑ์ด ์ ๋ฒ ์์์ผ๋ฉฐ ์ ์ฌ ์์์ ๋ง์ ์ด๊ฐ ๋จ๋
๋
ธ์๋ฅผ ๋ถ๋ฌธํ๊ณ ํฌ๊ฒ ์ฌํผํ๋ค๊ณ ์ ํด์ง๋ค. ์ฌํ ์กฐ์ ์ ๊ด์ง์ ์ถ์ฆํ๊ณ ์ ๋น๋ค์ ์ฐฌ์์(่ฉฉ)๋ฅผ ์ง์์ผ๋ฉฐ ๋ฐฑ์ฑ๋ค์ ์ถ๋ชจ๋น๋ฅผ ์ธ์ฐ๋ ๋ฑ, ์ด์์ ์ ์ค๋๋๋ก ๋ง์ ์ถ์์ ๋ฐ์์๋ค. ์ด๋ ์ผ์ ๊ฐ์ ๊ธฐ๋ฅผ ๊ฑฐ์ณ ํ๋์๋ ๋ง์ฐฌ๊ฐ์ง๋ก, ์ด์์ ์ ๋ํ๋ฏผ๊ตญ ๊ตญ๋ฏผ๋ค์ด ๊ฐ์ฅ ์กด๊ฒฝํ๋ ์์ธ ์ค ํ ๋ช
์ผ๋ก ๊ผฝํ๋ฉฐ ํ๋ ํ๊ตญ์์ ์ฑ์
์ด๋ผ๋ ์ต์๊ธ ์์ฌ๊ฐ ์ด๋ฆ ์์ ๋ถ์ด๋ ์ด๋ค ์ด์๋ ์ ๊ธฐ๋ฐ์ง ์๋, ์ธ์ข
๊ณผ ํจ๊ป ํ๊ตญ์ธ์๊ฒ ๊ฐ์ฅ ์ฌ๋๋ฐ๋ ํ๊ตญ์ฌ ์๋ ์์ธ์ด๋ค. ๊ฐ์ฅ ์กด๊ฒฝํ๋ ์์ธ์ ๋ฌป๋ ์ค๋ฌธ์กฐ์ฌ์์๋ ์ธ์ข
๋์๊ณผ 1, 2์๋ฅผ ๋คํฌ๋ฉฐ ์ถฉ๋ฌด๊ณต์ด๋ผ๋ ์ํธ๋ ์ค์ ๋ก๋ ๊น์๋ฏผ๊ณผ ๊ฐ์ ์ฌ๋ฌ ์ฅ์๋ค์ด ๋ฐ์ ์ํธ์ด์ง๋ง ํ๋ ํ๊ตญ์ธ๋ค์ ์ด์์ ์ ์ฉ ์ํธ๋ก ์ธ์ํ๋ค.
|
|
""".strip()
|
|
|
|
# English demo question paired with the passage below.
EN_EXAMPLE_QUERY = "When did Beyonce start becoming popular?"

# English demo passage (single paragraph, surrounding whitespace stripped).
# The literal previously contained stray `|` lines, a trailing `|` after
# ``.strip()`` (a syntax error) and mis-decoded non-ASCII characters; the
# accented "é" and the IPA pronunciation are restored here.
EN_EXAMPLE_CONTEXTS = """
Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
""".strip()