Spaces: Running

test pull request #1
opened by dyxohjl666
- README.md +5 -5
- app.py +0 -188
- controlled_summarization.py +0 -165
- dataset_extraction.py +0 -44
- description.py +0 -76
- examples/127.txt +0 -499
- examples/BERT - Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf +0 -0
- examples/BERT_body.txt +0 -1
- examples/BERT_paper.pdf +0 -0
- examples/H01-1042.pdf +0 -0
- examples/H01-1042_body.txt +0 -1
- examples/N18-3011_body.txt +0 -1
- examples/N18-3011_ref.txt +0 -27
- reference_string_parsing.py +0 -36
- requirements.txt +0 -6
- scibert-uncased.pt +0 -3
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: SciAssist
+emoji: π
+colorFrom: blue
+colorTo: gray
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.1.6
 app_file: app.py
 pinned: false
 license: afl-3.0
app.py
DELETED
@@ -1,188 +0,0 @@

import gradio as gr
from description import *

from reference_string_parsing import *
from controlled_summarization import *
from dataset_extraction import *

from controlled_summarization import recommended_kw
import requests

# Example Usage
# url = "https://arxiv.org/pdf/2305.14996.pdf"
# dest_folder = "./examples/"
# download_pdf(url, dest_folder)


with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
    gr.Markdown("# Gradio Demo for SciAssist")
    with gr.Tabs():

        # Controlled Summarization
        with gr.TabItem("Controlled Summarization"):

            with gr.Box():
                gr.Markdown(ctrlsum_file_md)
                with gr.Row():
                    with gr.Column():
                        ctrlsum_url = gr.Textbox(label="PDF URL", max_lines=1)
                        ctrlsum_file = gr.File(label="Input File")
                        ctrlsum_str = gr.TextArea(label="Input String", max_lines=5)
                    with gr.Column():
                        gr.Markdown("* Set the length of text used for summarization. Length 0 will exert no control over length.")
                        # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
                        # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
                        ctrlsum_file_length = gr.Radio(label="Length", value=0, choices=[0, 50, 100, 200])
                        kw = gr.Radio(visible=False)
                        ctrlsum_file_keywords = gr.Textbox(label="Keywords", max_lines=1)
                with gr.Row():
                    ctrlsum_file_btn = gr.Button("Generate")
                    ctrlsum_file_output = gr.Textbox(
                        elem_id="htext",
                        label="Summary",
                    )
                ctrlsum_file_examples = gr.Examples(
                    examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique", "", ""],
                              ["examples/H01-1042.pdf", 0, "automatic evaluation technique", "", ""]],
                    inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str, ctrlsum_url])


            ctrlsum_file_btn.click(
                fn=ctrlsum_for_file,
                inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str, ctrlsum_url],
                outputs=[ctrlsum_file_output, ctrlsum_str, ctrlsum_file]
            )

            def clear():
                return None, 0, None, None, gr.Radio(visible=False)

            def update_url(url):
                if url in recommended_kw.keys():
                    keywords = recommended_kw[url]
                    if keywords != None:
                        return None, None, gr.Radio(choices=keywords[:3], label="Recommended Keywords", visible=True,
                                                    interactive=True)
                return None, None, gr.Radio(visible=False)

            ctrlsum_file.upload(clear, inputs=None,
                                outputs=[ctrlsum_str, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_url, kw])
            ctrlsum_url.input(update_url, inputs=ctrlsum_url, outputs=[ctrlsum_str, ctrlsum_file, kw])

            ctrlsum_str.input(clear, inputs=None,
                              outputs=[ctrlsum_url, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file, kw])

            def select_kw(env: gr.SelectData):
                return env.value

            kw.select(select_kw, None, ctrlsum_file_keywords)

        # Reference String Parsing
        with gr.TabItem("Reference String Parsing"):
            gr.Markdown(rsp_title_md)
            with gr.Box():
                gr.Markdown(rsp_str_md)
                with gr.Row():
                    with gr.Column():
                        rsp_str = gr.Textbox(label="Input String")
                    with gr.Column():
                        rsp_str_dehyphen = gr.Checkbox(label="dehyphen")
                with gr.Row():
                    rsp_str_btn = gr.Button("Parse")
                    rsp_str_output = gr.HighlightedText(
                        elem_id="htext",
                        label="The Result of Parsing",
                        combine_adjacent=True,
                        adjacent_separator=" ",
                    )
                rsp_str_examples = gr.Examples(examples=[[
                    "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
                    True],
                    [
                    "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval-2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
                    False]], inputs=[rsp_str, rsp_str_dehyphen])
            with gr.Box():
                gr.Markdown(rsp_file_md)
                with gr.Row():
                    with gr.Column():
                        rsp_file = gr.File(label="Input File")
                        rsp_file_dehyphen = gr.Checkbox(label="dehyphen")
                with gr.Row():
                    rsp_file_btn = gr.Button("Parse")

                    rsp_file_output = gr.HighlightedText(
                        elem_id="htext",
                        label="The Result of Parsing",
                        combine_adjacent=True,
                        adjacent_separator=" ",
                    )
                rsp_file_examples = gr.Examples(examples=[["examples/N18-3011_ref.txt", False], ["examples/BERT_paper.pdf", True]], inputs=[rsp_file, rsp_file_dehyphen])


            rsp_file_btn.click(
                fn=rsp_for_file,
                inputs=[rsp_file, rsp_file_dehyphen],
                outputs=rsp_file_output
            )
            rsp_str_btn.click(
                fn=rsp_for_str,
                inputs=[rsp_str, rsp_str_dehyphen],
                outputs=rsp_str_output
            )


        # Dataset Extraction
        with gr.TabItem("Dataset Mentions Extraction"):
            gr.Markdown(de_title_md)
            with gr.Box():
                gr.Markdown(de_str_md)
                with gr.Row():
                    with gr.Column():
                        de_str = gr.Textbox(label="Input String")
                with gr.Row():
                    de_str_btn = gr.Button("Extract")
                    de_str_output = gr.HighlightedText(
                        elem_id="htext",
                        label="The Result of Extraction",
                        combine_adjacent=True,
                        adjacent_separator=" ",
                    )
                de_str_examples = gr.Examples(examples=[["The impact of gender identity on emotions was examined by researchers using a subsample from the National Longitudinal Study of Adolescent Health. The study aimed to investigate the direct effects of gender identity on emotional experiences and expression. By focusing on a subsample of the larger study, the researchers were able to hone in on the specific relationship between gender identity and emotions. Through their analysis, the researchers sought to determine whether gender identity could have a significant and direct impact on emotional well-being. The findings of the study have important implications for our understanding of the complex interplay between gender identity and emotional experiences, and may help to inform future interventions and support for individuals who experience gender-related emotional distress."],
                                                        ["The possibility of genotype-environment interaction for memory performance and change was examined in 150 monozygotic twin pairs from the Swedish Adoption Twin Study of Aging and the National Comorbidity Survey. They aimed to explore how genetic and environmental factors could interact to affect cognitive performance in aging individuals. Through their analysis, the researchers hoped to gain a better understanding of the complex interplay between nature and nurture in determining cognitive outcomes. By investigating the unique characteristics of monozygotic twins, who share identical genetic material, the study was able to isolate the role of environmental factors in shaping cognitive abilities over time. The findings from this research have important implications for our understanding of the complex interplay between genetics and the environment in shaping cognitive outcomes in aging individuals."]],
                                              inputs=[de_str])
            with gr.Box():
                gr.Markdown(de_file_md)
                with gr.Row():
                    with gr.Column():
                        de_file = gr.File(label="Input File")
                with gr.Row():
                    de_file_btn = gr.Button("Extract")

                    de_file_output = gr.HighlightedText(
                        elem_id="htext",
                        label="The Result of Extraction",
                        combine_adjacent=True,
                        adjacent_separator=" ",
                    )
                de_file_examples = gr.Examples(examples=[["examples/127.txt"]], inputs=[de_file])


            de_file_btn.click(
                fn=de_for_file,
                inputs=[de_file],
                outputs=de_file_output
            )
            de_str_btn.click(
                fn=de_for_str,
                inputs=[de_str],
                outputs=de_str_output
            )


demo.launch(share=False)
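For orientation, a minimal, self-contained sketch of the Blocks wiring pattern used by the deleted app.py (declare components, then register a click handler) is shown below. It is illustrative only: the summarize_stub function and its labels are placeholders invented for this sketch, not part of this repository or of SciAssist.

import gradio as gr

def summarize_stub(text: str, length: int) -> str:
    # Placeholder standing in for ctrlsum_for_str / ctrlsum_for_file from the deleted modules:
    # it just truncates the input so the demo wiring can be run on its own.
    return text[: length or 100]

with gr.Blocks() as sketch:
    inp = gr.Textbox(label="Input String")
    length = gr.Radio(label="Length", value=0, choices=[0, 50, 100, 200])
    btn = gr.Button("Generate")
    out = gr.Textbox(label="Summary")
    # Map the click event to the handler, mirroring ctrlsum_file_btn.click above.
    btn.click(fn=summarize_stub, inputs=[inp, length], outputs=out)

if __name__ == "__main__":
    sketch.launch(share=False)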
controlled_summarization.py
DELETED
@@ -1,165 +0,0 @@

from typing import List, Tuple
import torch
from SciAssist import Summarization
import os
import requests
from datasets import load_dataset

print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    device = 'gpu'
    ctrlsum_pipeline = Summarization(os_name="nt", model_name="flan-t5-xl", checkpoint="dyxohjl666/flant5-xl-cocoscisum", device=device)
else:
    device = 'cpu'
    ctrlsum_pipeline = Summarization(os_name="nt", device=device)


acl_dict = {}
recommended_kw = {}
acl_data = load_dataset("dyxohjl666/CocoScisum_ACL", revision="refs/convert/parquet")


def convert_to_dict(data):
    """ Dict:
        { url:
            {length:
                {keywords: summary};
             raw_text:
                str;
            }
        }

    """
    url = data["url"]
    text = data["text"]
    keywords = data["keywords"]
    length = data["length"]
    summary = data["summary"]
    for u, t, k, l, s in zip(url, text, keywords, length, summary):
        if len(u) < 5:
            continue
        u = u + ".pdf"
        if k == None:
            k = ""
        if l == None:
            l = ""
        k = str(k).strip()
        l = str(l).strip()
        if u in acl_dict.keys():
            if k in acl_dict[u][l].keys():
                continue
            else:
                acl_dict[u][l][k] = s
        else:
            acl_dict[u] = {"": {}, "50": {}, "100": {}, "200": {}, "raw_text": t}

        # kws
        if u in recommended_kw.keys():
            if k == "" or k in recommended_kw[u]:
                continue
            else:
                recommended_kw[u].append(k)
        else:
            recommended_kw[u] = []
    return 1


for i in acl_data.keys():
    signal = convert_to_dict(acl_data[i])


def download_pdf(url, dest_folder):
    """
    Download a PDF from a given URL and save it to a specified destination folder.
    Parameters:
        url (str): URL of the PDF
        dest_folder (str): Destination folder to save the downloaded PDF
    """

    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    response = requests.get(url, stream=True)
    filename = os.path.join(dest_folder, url.split("/")[-1])

    with open(filename, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    print(f"Downloaded {url} to {filename}")
    return filename


def ctrlsum_for_str(input, length=None, keywords=None) -> List[Tuple[str, str]]:
    if keywords is not None:
        keywords = keywords.strip().split(",")
        if keywords[0] == "":
            keywords = None
    if length == 0 or length is None:
        length = None
    results = ctrlsum_pipeline.predict(input, type="str",
                                       length=length, keywords=keywords, num_beams=1)

    output = []
    for res in results["summary"]:
        output.append(f"{res}\n\n")
    return "".join(output)


def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> List[Tuple[str, str, str]]:
    if input == None and url == "":
        if text == "":
            return None, "Input cannot be left blank.", None
        else:
            return ctrlsum_for_str(text, length, keywords), text, None
    else:
        filename = ""
        url = url.strip()
        if url != "":
            if len(url) > 4 and url[-3:] == "pdf":
                if url.strip() in acl_dict.keys():
                    raw_text = acl_dict[url]["raw_text"]
                    l = str(length)
                    if length == 0:
                        l = ""
                    if l in acl_dict[url].keys():
                        if keywords.strip() in acl_dict[url][l].keys():
                            summary = acl_dict[url][l][keywords]
                            return summary, raw_text, None
                    if keywords.strip() == "":
                        keywords = None
                    if l == "":
                        l = None
                    return ctrlsum_for_str(raw_text, int(l), keywords), raw_text, None

                filename = download_pdf(url, './cache/')
            else:
                "Invalid url(Not PDF)!", None, None
        else:
            filename = input.name
        if keywords != "":
            keywords = keywords.strip().split(",")
            if keywords[0] == "":
                keywords = None
        if length == 0:
            length = None
        # Identify the format of input and parse reference strings
        if filename[-4:] == ".txt":
            results = ctrlsum_pipeline.predict(filename, type="txt",
                                               save_results=False,
                                               length=length, keywords=keywords, num_beams=1)
        elif filename[-4:] == ".pdf":
            results = ctrlsum_pipeline.predict(filename,
                                               save_results=False, length=length, keywords=keywords, num_beams=1)
        else:
            return "File Format Error !", None, filename

        output = []
        for res in results["summary"]:
            output.append(f"{res}\n\n")
        return "".join(output), results["raw_text"], filename


ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
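A hypothetical usage sketch of the deleted helpers is shown below, assuming the file as it existed before this deletion is on the Python path and that SciAssist, datasets and torch are installed (importing the module triggers its model and dataset loading). It is not part of the pull request; the length and keyword values are taken from the demo's choices purely for illustration.

# Hypothetical usage of the deleted controlled_summarization module.
from controlled_summarization import ctrlsum_for_str, ctrlsum_str_example

# Summary steered by the demo's "50" length setting and a keyword hint.
summary = ctrlsum_for_str(ctrlsum_str_example, length=50, keywords="BERT")
print(summary)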
dataset_extraction.py
DELETED
@@ -1,44 +0,0 @@

from typing import List, Tuple
import torch
import nltk
from SciAssist import DatasetExtraction

device = "gpu" if torch.cuda.is_available() else "cpu"
de_pipeline = DatasetExtraction(os_name="nt", device=device)


def de_for_str(input):
    list_input = nltk.sent_tokenize(input)
    results = de_pipeline.extract(list_input, type="str", save_results=False)

    # output = []
    # for res in results["dataset_mentions"]:
    #     output.append(f"{res}\n\n")
    # return "".join(output)

    output = []
    for mention_pair in results["dataset_mentions"]:
        output.append((mention_pair[0], mention_pair[1]))
        output.append(("\n\n", None))
    return output

def de_for_file(input):
    if input == None:
        return None
    filename = input.name
    # Identify the format of input and parse reference strings
    if filename[-4:] == ".txt":
        results = de_pipeline.extract(filename, type="txt", save_results=False)
    elif filename[-4:] == ".pdf":
        results = de_pipeline.extract(filename, type="pdf", save_results=False)
    else:
        return [("File Format Error !", None)]

    output = []
    for mention_pair in results["dataset_mentions"]:
        output.append((mention_pair[0], mention_pair[1]))
        output.append(("\n\n", None))
    return output


de_str_example = "BAKIS incorporates information derived from the bank balance sheets and supervisory reports of all German banks ."
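A hypothetical usage sketch of the deleted helpers is shown below, assuming the file as it existed before this deletion is on the Python path, SciAssist and torch are installed, and the NLTK sentence tokenizer data is available. It is not part of the pull request.

# Hypothetical usage of the deleted dataset_extraction module.
import nltk
nltk.download("punkt")  # tokenizer data used by nltk.sent_tokenize inside de_for_str

from dataset_extraction import de_for_str, de_str_example

# de_for_str returns (text, label) pairs in the format gr.HighlightedText expects.
for text, label in de_for_str(de_str_example):
    print(repr(text), label)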
description.py
DELETED
@@ -1,76 +0,0 @@

# Reference string parsing Markdown
rsp_title_md = '''
## Reference String Parsing parses a citation string, extracting information such as the title, authors, and publication date.
'''

rsp_str_md = '''
To **test on strings**, simply input one or more strings.
'''

rsp_file_md = '''
To **test on a file**, the input can be:

- A txt file which contains a reference string in each line.

- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).

'''
# - A pdf file which contains a whole scientific document without any processing (including title, author...).

ssum_str_md = '''
To **test on strings**, simply input a string.

'''

ssum_file_md = '''
To **test on a file**, the input can be:

- A txt file which contains the content to be summarized.

- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).


'''

# - The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
ctrlsum_str_md = '''
To **test on strings**, simply input a string.

**Note**:

- Length 0 will exert no control over length.


'''

ctrlsum_file_md = '''
This is the demo for **CocoSciSum**.

## Controlled Summarization uses FLAN-T5 to generate user-customised summaries from your input file or URL link.

To **test on a file**, the input can be:

- A txt file which contains the content to be summarized.

- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).



'''

de_title_md = '''
## Dataset Extraction detects dataset mentions from the input text.
'''

de_str_md = '''
To **test on strings**, please input your sentences or paragraphs.
'''

de_file_md = '''
To **test on a file**, the input can be:

- A txt file which contains the content to be extracted dataset mentions from.

- A pdf file which contains a whole scientific documention without any preprocessing (including title, author, body text...).

'''
examples/127.txt
DELETED
@@ -1,499 +0,0 @@
|
|
1 |
-
Our results uncover a so far undocumented ability of the interbank market to distinguish between banks of different quality in times of aggregate distress.
|
2 |
-
We show empirical evidence that during the 2007 financial crisis the inability of some banks to roll over their interbank debt was not due to a failure of the interbank market per se but rather to specific shocks affecting banks' capital, liquidity and credit quality as well as revised bank-level risk perceptions.
|
3 |
-
Relationship banking is not capable of containing these frictions, as hard information seems to dominate soft information.
|
4 |
-
In detail, we explore determinants of the formation and resilience of interbank lending relationships by analyzing an extensive dataset comprising over 1.9 million interbank relationships of more than 3,500 German Keywords: financial stability, interbank market, aggregate and idiosyncratic shocks, relationship banking, risk perception, market discipline *
|
5 |
-
We wish to thank Steven Ongena, Lui-Hsian Wong and the seminar participants at the Deutsche Bundesbank for their helpful comments and suggestions.
|
6 |
-
The views expressed in this paper are those of the authors and do not necessarily reflect the views of the Deutsche Bundesbank, the Eurosystem or its staff.
|
7 |
-
Email: [email protected] (Peter Bednarek), [email protected] (Valeriya Dinger) and [email protected] (Natalja von Observing the interbank market distress of 2007-2008, major central banks around the world tried to contain the macroeconomic consequences by means of broad interventions, including not only injecting additional liquidity into the banking sector but also an adjustment of monetary policy instruments (Gabrieli and Georg, 2014).
|
8 |
-
The large scale of these policies over the past few years has effectively made central banks the main money market intermediaries (BrΓ€uning and Fecht, 2012).
|
9 |
-
Such interventions have been successful in preventing liquidity crunches, but come at the cost of neglecting the market discipline mechanism inherent to the interbank market.
|
10 |
-
The question of how reliable the functioning of market discipline is in times of aggregate distress is therefore crucial for an evaluation of the benefits and costs of the interbank interventions.
|
11 |
-
In this regard, empirical research has already documented the role played by the intensity of interbank relations for the availability and the conditions of interbank borrowing in times of crisis (Cocco et al., interbank positions change in response to idiosyncratic shocks and whether this change is contingent in times of aggregate distress.
|
12 |
-
In this paper, we close this gap and empirically examine the sensitivity of bilateral interbank positions to both aggregate and idiosyncratic shocks.
|
13 |
-
We employ several definitions of idiosyncratic shocks which are based on measuring the relative deterioration of a bank's capital, liquidity or credit quality.
|
14 |
-
We then study how idiosyncratic shocks which hit the borrowing or lending bank affect the intensity of interbank positions in normal times and time of aggregate distress.
|
15 |
-
By disentangling the role of idiosyncratic and aggregate shocks, we aim to provide evidence of whether the turmoil in the interbank market was due to a general failure of the interbank market in reallocating liquidity efficiently within the banking sector itself, or rather to revised bank-level risk perceptions that lead to a stressed money market.
|
16 |
-
The study is based on data on the bilateral exposures of German banks for the period 2000Q1 to 2012Q3.
|
17 |
-
We employ a two-stage estimation model which first evaluates the probability of the existence of a bilateral interbank position and then estimates the determinants of the volume of this position.
|
18 |
-
Our results show that aggregate distress has a statistically significant negative effect on bilateral interbank exposures, although, in economic terms, idiosyncratic shocks are economically by far more important.
|
19 |
-
Also, we find that interbank positions react to idiosyncratic shocks even if the market as a whole is in distress.
|
20 |
-
More specifically, we show that lending banks statistically and economically reduce their exposures to banks that have suffered idiosyncratic shocks.
|
21 |
-
In terms of existing relationships, we find that these are not fully terminated following a shock but that their intensity is reduced.
|
22 |
-
Interestingly, the intensity of bilateral exposures is driven not only by the shocks that hit the borrowing bank but also by those that hit the lending bank.1
|
23 |
-
In the case of borrowing banks, we find that the intensity of the interbank relation is sensitive to shocks to capital, liquidity and to credit quality.
|
24 |
-
This sensitivity is highest, however, for the shocks to a borrower bank's liquidity position.
|
25 |
-
In the case of lending banks, we find that shocks to their capitalization do not affect the intensity of interbank relations, but shocks to liquidity and, in particular, shocks to credit quality have a strong negative effect.
|
26 |
-
Further, the effects are nonlinear for both borrowing and lending banks.
|
27 |
-
They are also contingent on the aggregate state of the financial system.
|
28 |
-
Moreover, we explore whether relationship banking can outweigh the negative effects induced by bank-specific shocks.
|
29 |
-
Unlike results of earlier studies, which find that relationship banking helps to overcome financial instability, we show distinct evidence that hard information seems to dominate soft information, as neither longer nor more intense 1
|
30 |
-
This is contrary to Afonso et al. (2011) who find a relationship for the US interbank market between bank characteristics and the volume of the exposure only for the borrowing banks but not for the lending banks.
|
31 |
-
interbank relationships in the past contain the negative effects of either aggregate or idiosyncratic shocks regarding the banks' capital, credit quality or liquidity.
|
32 |
-
A number of features underline the novelty of our paper relative to the existing literature.
|
33 |
-
First, we control not only for the volume but also for the existence of lending.
|
34 |
-
This allows us to explore both intensive and extensive margins of interbank market dynamics.
|
35 |
-
In this regard, we are the first to utilize a Heckman Correction methodology to counter the empirical problem of sample selection arising from the fact that banks participating in a bilateral interbank relation may differ in important unmeasured ways from banks which do not participate.
|
36 |
-
For example differing business models may foster interbank market participation or restrain banks from doing so.
|
37 |
-
Hence, we provide insights into both the main drivers which increase or decrease the probability of forming bank-to-bank relationships, as well as their impact on interbank lending exposures.
|
38 |
-
Second, the length of our sample allows us to make comparisons between normal and crisis times.
|
39 |
-
In detail, we analyze the most extensive dataset so far comprising over 1.9 million interbank lending relationships of more than 3,500 German banks conducted between 2000Q1 and Our results contribute to several strands of the literature.
|
40 |
-
To start with, by showing that interbank exposure, even in times of aggregate distress, is related to the conditions of the borrowing bank, we confirm an insight gained from various studies on market discipline in (1992), for example, also find that banks are well-informed parties in judging the solvency of illiquid peer banks.
|
41 |
-
This view has been debated by Goodfriend (2002) and Martin and McAndrews (2007).
|
42 |
-
These papers claim that banks are not apt to monitor other banks, because the implicit guarantee supplied by central banks, which are expected to intervene in case of crisis, undermine banks' incentives to monitor their peers.
|
43 |
-
More recent studies, reconcile the two sides of the debate by finding that banks possess knowledge regarding other banks' health, even while highlighting that this is only complementary to the knowledge of central banks.
|
44 |
-
More specifically, Furfine (2001) documents that interbank interest rates in the US federal funds market reflect in part the credit risk of the borrowing banks.
|
45 |
-
Similarly, King (2008) demonstrates that high-risk banks pay more than safe banks for interbank loans.
|
46 |
-
Dinger and von Hagen (2009) show that in systems characterized by longer-term interbank exposures the monitoring role of lenders is more important, and BrΓ€uning and Fecht (2012) find evidence for the existence of private information in the German interbank market, as relationship lenders were already charging higher interest rates to their borrowers in the run-up to the financial crisis of 2007-2008, whereas, during the crisis, borrowers paid lower rates on average to their relationship lenders than to spot lenders.
|
47 |
-
While, in the case of a well-functioning interbank market, the evidence on peer monitoring is mixed, for times of aggregate market turmoil most existing literature predicts that market discipline will be further undermined by a lower sensitivity to fundamentals common that information asymmetry becomes worse during a crisis when the percentage of risky banks goes up and investors are unable to differentiate among the credit risks of individual banks.
|
48 |
-
As a result, lenders require a higher yield to participate in the market.
|
49 |
-
In cases of particularly severe distress, adverse selection issues can generate a complete freeze of the interbank market.
|
50 |
-
Following this argument, central banks should intervene as a lender of last resort in order to prevent liquidity distress of solvent banks.
|
51 |
-
The results of our study contradict this view and uncover a so far undocumented ability on the part of the interbank market to distinguish between banks of different quality in times of aggregate distress.
|
52 |
-
The remainder of our paper is organized as follows.
|
53 |
-
In Section 2 we describe the data.
|
54 |
-
The main estimation results are presented in Section 4.
|
55 |
-
Section 5 describes a battery of robustness tests.
|
56 |
-
We construct a unique unbalanced panel bank-to-bank level dataset that contains information about the German interbank market from the first quarter of 2000 to the third quarter of 2012.23 The construction of the dataset makes use of several data sources.
|
57 |
-
The central source is the Deutsche Bundesbank's credit register data (MiMik) which contains information on all big individual exposures of German banks to firms (including other banks).
|
58 |
-
This source gives us information on whether a bank with a German charter has lent to any other banks and, if so, how much of the interbank lending is outstanding at the end of each quarter.
|
59 |
-
Next, we add information from the balance sheet of the lending and borrowing banks.
|
60 |
-
This information stems from the monthly balance sheet statistics Bista and BAKIS.4 Moreover, we utilize the banks' estimates of their counterparty's probability of default (PD) which has been part of the general MiMik dataset since 2008.5 Panel A of Table 1 provides summary statistics on the number of banks, their distinct bank group and the number of bank-quarter observations on those entities as well as the overall number of observations.
|
61 |
-
In total, our dataset covers an extensive amount of 4.6 million bank-quarter observations on a total of 3,550 German banks.
|
62 |
-
In around 40% of 2 Details on the credit register can be found in Schmieder (2006), and in published work by Schertler et al. (2006), Hayden et al. (2007) and Ongena et al. (2012), for example.
|
63 |
-
The Bundesbank also maintains a website with working papers based on its credit register.
|
64 |
-
For a more detailed definition, see Section 14 of the Banking Act (Deutsche Bundesbank, 2001).
|
65 |
-
If exposures existed during the reporting period but are partly or fully repaid, the remaining exposure is reported even if the amount is zero.
|
66 |
-
Due data limitations we take the actual amount of exposures into consideration that is the reported end-quarter amounts.
|
67 |
-
4 We match the end of the quarter value of the Bista variables to the quarterly frequency of the interbank data.
|
68 |
-
A few balance sheet items - such as non-performing loans - are not covered by Bista.
|
69 |
-
We therefore uncover them from BAKIS, which is an information system that is shared between the Bundesbank and BaFin (the German Federal Banking Supervisory Office) and comes with annual frequency.
|
70 |
-
Each counterparty is assessed by several different creditor banks; we take the median value of all estimated PDs.
|
71 |
-
these bank-quarter observations we detect actual bank lending relationships between a creditor bank C and a borrower bank B.
|
72 |
-
In the minority of the cases, lending is conducted between banks belonging to the same bank holding company (BHC).
|
73 |
-
Surprisingly, we detect a considerable amount of reciprocal lending relationships, that is more than 820,000 bank-quarter observations show a pattern of a contemporaneous reverse lending from the initial borrowing bank B to creditor bank C. Moreover, the German interbank market is not fragmented along the lines of the traditional three-pillar structure of the German banking system, in which private commercial banks form the first pillar, public banks, such as Landesbanken and saving banks, form the second pillar, and cooperative banks the third pillar.
|
74 |
-
We detect a considerable interconnection between all market participants, where the large banks such as the big, i.e. major banks, regional banks and the Landesbanken emerge mostly as borrowers, and savings and cooperative banks emerge as lenders (see Craig et al. 2015).
|
75 |
-
For instance, savings banks provide lending not only to the Landesbanken
|
76 |
-
but also to private mortgage banks and big banks.
|
77 |
-
Panel B of Table 1 provides descriptive statistics on these interbank exposures as well as some initial impressions of how German interbank relationships are structured.
|
78 |
-
Interbank exposures and, especially, reciprocal exposures exhibit a strong variance with mean values of 51 million and 86 million and a standard deviation (SD) of around 0.9 billion and 1.4 billion, respectively.
|
79 |
-
Following Furfine (1999), we measure the strength of an interbank relation by (i) the duration of the bilateral exposure, as well as by (ii) the concentration of the banks' lending and/or borrowing activity.
|
80 |
-
Regarding the relationship's duration, we calculate Credit relation span by adding up the bank quarters of a creditor bank C providing continuous lending to a specific borrower bank B. As in the case of interbank lending, both borrower and creditor are financial institutions and can, for instance, cooperate by mutually providing liquidity to each other.
|
81 |
-
We also consider the possible two-sided nature of interbank relationships by computing Reciproc relation span by adding up the quarters the current borrower bank B is continuously lending to creditor bank C. Accordingly, Total relation span adds up the number of quarters in which both banks C and B are related to each other in either direction.
|
82 |
-
In line with Petersen and Rajan (1994) Total relation span is a proxy for private information mitigating problems of asymmetric information.
|
83 |
-
Overall, interbank relationships between distinct bank pairs last on average for around three years.
|
84 |
-
If a relationship breaks at some point, it takes approximately the same amount of time for a relation to be re-established.
|
85 |
-
Regarding the concentration on one lender/borrower, we follow Cocco et al.
|
86 |
-
(2009) and BrΓ€uning and Fecht (2012)
|
87 |
-
and compute the amount lent by a creditor bank C to a borrower bank B relative to the overall amount lent by bank C in any distinct quarter .
|
88 |
-
Formally, this lender preference index (LPI) is defined as whereas we set the variable to zero if the denominator is zero, i.e. if the lender did not lend at all.
|
89 |
-
Similarly, we compute the borrower preference index (BPI) as the amount borrowed by bank B from bank C relative to the overall borrowing by bank B in quarter Again we detect a considerable high variance with some banks lending to and borrowing from only a single counterparty, whereas the mean values of the indices are 6.1% and Last, Panel C provides descriptive statistics on the most important bank characteristics.6 Regarding size, most banks in our sample are rather small ones with total assets amounting to 378 million, but with 3.6 billion as a mean value.
|
90 |
-
In general, 6 To control for spurious outliers we delete all observations except Size at the 0.1% level.
|
91 |
-
As robustness checks we rerun our specifications with varying measures or without any outlier correction measures.
|
92 |
-
Results do not change qualitatively or quantitatively .
|
93 |
-
regulatory capital ratios (CAPR) are quite high with a mean (median) value of 20.4% (13.8%).
|
94 |
-
The importance of the traditional bank loan for financial intermediation in Germany is mirrored by the loans to asset ratio (LAR), as loans to non-financials comprise around 60% of the banks' balance sheet.
|
95 |
-
Around 4% of those loans are non-performing (NPLR)7. 20% of the banks' assets are liquid (LIQR) and the return on risk-weighted This rich data source makes it possible to observe the behavior of nearly the entire German interbank market and the use of the bank-specific balance sheet information enables us to analyze the most important determinants of interbank market (in)stability.
|
96 |
-
However, before we can make meaningful causal inferences some methodological shortcomings have to be solved.
|
97 |
-
First, between the first quarter of 2000 to the third quarter of 2012 a number of bank mergers took place.
|
98 |
-
We carry out a merger correction procedure by creating a new separate bank after the merger takes place.8
|
99 |
-
The relationships' duration still amounts to nearly three years, which should be a sufficient amount of time to overcome asymmetric information due to relationship banking (Rochet & Tirole, 1996).
|
100 |
-
Nevertheless, results are robust to alternative specifications.
|
101 |
-
Second, and most important, we have to account for the possibility of an endogenous sample selection, as around 60% of our bank quarters do not contain an interbank lending relationship, because either banks stopped participating in the interbank market in general 7 Especially saving and cooperative banks exhibit high values of non-performing loans.
|
102 |
-
Our approach is based on separating the pre-merger banks from the merged bank.
|
103 |
-
In the end, we have three banks, which are treated independently from each other.
|
104 |
-
We repeat this procedure as often as a merger takes place.
|
105 |
-
Each time a new merged bank receives a new identification number, we drop the target banks in that quarter.
|
106 |
-
or interrupted a specific interbank relationship.
|
107 |
-
A sample selection bias may arise if the sample consists only of banks which choose to participate in the interbank market and these banks differ in important unmeasured ways from banks which do not participate.
|
108 |
-
We utilize the Heckman Correction methodology to overcome this issue.
|
109 |
-
That is, we first estimate the probability of an interbank lending relationship taking place with a Probit with being a vector of explanatory variables, a vector of unknown parameters and the cumulative distribution function of the standard normal distribution.9 Afterwards, we compute the inverse Mills ratio as the ratio between the standard normal probability density function and the standard normal cumulative distribution function , each evaluated at observation , and utilize finally in the second step as a further regressor in a standard OLS regression model.10 The dependent variable for the second step is the logarithmic change in the exposure of creditor bank C to borrower bank B and is defined as Moreover, to compare our results with those of earlier studies we also employ LPI and BPI concentration measures as proxies for the change in the intensity of an interbank relation.
|
110 |
-
Accordingly, we estimate the following baseline regression model with parameters 9 We use clustered standard errors with the lending relationship between creditor bank C and borrower bank B as our cluster variable.
|
111 |
-
where is the vector of explanatory right-hand side (RHS) variables, a vector of unknown parameters, the unknown parameter of the estimated inverse Mills ratio is the composite error term including the time invariant unobserved effect.
|
112 |
-
In detail, is a varying time dummy variable capturing the effects of the 2007 financial crisis is a vector of relationship variables defined as which is a proxy for private information.
|
113 |
-
As described in the previous section captures the interbank history of a specific pair of banks C and B by adding up the quarters in which those two banks have either a lending or borrowing relationship in .
|
114 |
-
To proxy the bank's relationship intensity we use the logarithm of the lagged exposure from the creditor bank C to the borrower bank B, ln() Moreover, to analyze the effect of reciprocity we also utilize the reciprocal lending from the initial borrower bank B to the creditor bank C, ln this variable we take the contemporaneous values, since we are particularly interested in exploring whether truly reciprocal exposure increases the stability of the relation11.
|
115 |
-
And finally, we account with a dummy variable for banks belonging to the same bank holding company (BHC) where the variable takes the value of one if both banks belong in quarter to the same BHC and zero otherwise.
|
116 |
-
Regarding the set of control variables, we use standard bank characteristics with a one quarter lag and a set of dummy 11 Results do not change qualitatively or quantitatively if we utilize reciprocal exposure with one quarter lag.
|
117 |
-
variables classifying each bank in any distinct quarter into a specific bank group listed in Panel A of Table 1.
|
118 |
-
We utilize both types of controls for every bank .
|
119 |
-
More precisely, bank-specific characteristics are the bank's logarithm of total assets ), the liquidity to assets ratio ( ), the regulatory capital ratio ) and the return on risk-weighted assets ratio (() Table 2 describes all variables employed in the estimations.
|
120 |
-
Its Panel A illustrates the left-hand side (LHS) variables, while Panel B is focused on the right-hand side (RHS) variables, including our fix set of control variables.
|
121 |
-
To give an answer to the question of whether the German interbank market was frozen due to an aggregate shock disabling an efficient liquidity allocation or whether it was partially stressed due to bank-specific shocks and possibly revised risk perceptions, we expand this Heckit baseline model consisting of both model (3) and (6) stepwise.
|
122 |
-
First, to analyze whether a longer or stronger interbank relationship in the past mitigates possible negative effects, we expand the plain baseline models by interaction terms of the Second, we augment the baseline models by () which is a vector of different lagged risk measures for every bank defined as and to analyze whether risk perception changes to some extent during periods of aggregate distress, we estimate an interaction term model also with an interaction term of 12 As robustness checks we utilize varying sets of control variables and use varying lags for our main variables of interest.
|
123 |
-
Furthermore we rerun the models for private banks only, i.e without Landesbanken, savings and cooperative and cooperative central banks.
|
124 |
-
Results do not change qualitatively or quantitatively.
|
125 |
-
Third, and most important, we expand the baseline models (3) and (6) by an alternating set of idiosyncratic shock variables.
|
126 |
-
In detail, we compute idiosyncratic shocks at the bases of the creditor, respectively the borrower bank's capitalization (), credit quality (), liquidity () and profitability (()).
|
127 |
-
Further, we specify shocks regarding the bank's and .
|
128 |
-
Our framework distinctively expands those of existing studies.
|
129 |
-
For instance, Afonso et al. (2011) concentrate on the banks' non-performing loans and profitability, whereas Cocco et al. (2009) and BrΓ€uning and Fecht (2012) do not explicitly account for these and measure liquidity risks solely via reserve holdings and the banks' maturity mismatch.
|
130 |
-
Moreover, to the best of our knowledge we are the first to account for a possible non-linear behavior of these determinants by employing the following method to determine bank-specific shocks.13 First, we construct the yearly distribution of each of the above variables and divide this distribution into its ten deciles.
|
131 |
-
In a second step, we define an idiosyncratic shock as an alternating dummy variable that takes the value one if the value of the respective variable for the bank has moved by 1 (2,..., 9) decile(s) in an unfavorable direction from one quarter to another and zero otherwise.
|
132 |
-
All in all, the basic idea is to stress-test somewhat not the bank's balance sheets to an unfavorable macroeconomic scenario, but rather the interbank relations to detect breaking points that, in turn, destabilize the interbank market itself.
|
133 |
-
Hence, we expand both steps of the baseline Heckman Correction models by the following term which determines creditor and borrower bank specific shock variables for every underlying shock variable of any strength .
|
134 |
-
It can be seen that we run a comprehensive set of regressions analyzes in which the idiosyncratic shock variable changes in two 13 Results of unreported tests where we examine the effect of quadratic terms indicate a non-linear behavior of those underlying bank determinants.
|
135 |
-
First, with regard to the potential shock, we want to analyze shocks of the bank's capitalization, credit quality, liquidity, profitability and risk.
|
136 |
-
And, second, the idiosyncratic shock variable alters regarding the strength of the shock, i.e. whether it is a moderate or a more serious shock, such as a heavy slip from one quarter to another amounting to several deciles in the underlying variable's distribution.
|
137 |
-
Lastly, to analyze possible differences between the crisis and the non-crisis period we estimate an interaction term model with an interaction term of the following form and to answer the question of whether relationship banking, i.e. a longer and more intense interbank relationship in the past, can help to overcome possible negative effects of idiosyncratic shocks, we expand the baseline models (3) and (6), finally, by 4.1 RELATIONSHIP BANKING AND THE 2007 FINANCIAL CRISIS We start by presenting the results of the baseline regression model of the determinants of interbank lending and the effects of the 2007 financial crisis period in Table 3.
|
138 |
-
To capture the effect of an aggregate shock we utilize a variable which is a time this quarter, several important events happened likely to disrupt market confidence, triggering general market turmoil, such as the announcement by the German bank IKB that it was in distress on July 30th and the close-down of two BNP Paribas funds on August 9th.14
|
139 |
-
Additionally, we run robustness tests with altering crisis period definitions, for example also splitting the crisis period into different sub-crisis periods, such as the the Euro crisis (2010Q1 - 2012Q3).
|
140 |
-
Results of the latter, disaggregated definition are presented in Column 2, 4, 6 and 8.
|
141 |
-
Nevertheless, as results applying these alternating definitions do not vary a lot either economically or statistically, we adhere to the aggregated definition in subsequent analyses.
|
142 |
-
With regard to the parameter estimates, columns 1 and 2 depict the results of the first step of the Heckman Correction method where the dependent variable is Credit relation, which is a binary variable taking the value one if there is a specific lending relationship between a creditor bank C and a borrower bank B, and zero otherwise.
|
143 |
-
The results of the second step of the Heckman correction method are presented in columns 3 to 8, where the dependent variable in columns 3 and 4 is the Exposure change in log differences, the lender preference index (LPI) in columns 5 and 6, and the borrower preference index (BPI) in columns 7 and 8, respectively.
|
144 |
-
Not surprisingly, we detect a highly significant negative effect of on the probability of establishing an interbank lending relationship, although the effect is most severe in the commercial paper and euro crisis period.
|
145 |
-
The negative coefficients can be interpreted to some extent as rising search costs due to the inability to assess institutions' risk during the crisis.
|
146 |
-
However, the actual economical effect is rather small.
|
147 |
-
Unreported marginal effects show a decrease of between 1.5% and 9.6% in the probability.15 In contrast, distinctively affects the lender and borrower preference indexes leading to a higher concentration of interbank lending and borrowing.
|
148 |
-
It is unclear whether this is due to creditor banks tending to lend to a smaller number of banks and perhaps staying with those with which they have a stronger interbank relationship.
|
149 |
-
As we do not 14 As BNP Paribas became the first major financial group to acknowledge the impact of the sub-prime crisis by closing those two funds exposed to it, this date is generally seen as the start of the global credit crisis.
|
150 |
-
15 To draw conclusions about the economic effects, we estimate both the probit model's marginal effects at mean (MEM) and its average marginal effects (AME) (Williams, 2011).
|
151 |
-
observe the price for liquidity, it could also be the case that borrower banks shift their borrowing to banks that provide them with cheaper liquidity.
|
152 |
-
Indeed, BrΓ€uning and Fecht (2012) show some evidence that, at the height of the 2007 financial crisis, relationship lenders charged lower interest rates than spot lenders.
|
153 |
-
Regarding the actual interbank exposures, we do see a negative effect of but not a decisively strong one.
|
154 |
-
The coefficient is, though, a substantially higher one in the Euro crisis period, in which the ECB instituted the long-term refinancing operation (LTRO) programs that allowed banks to borrow in total over a trillion euros for a period of up to three years.
Following Gabrieli and Georg (2014) who point out that the striking increase in risk premia in the Eurozone money market in 2008Q3 was clearly subsequent to rather than before the change in the operational framework involving a switch from a regular variable-rate tender procedure to a fixed-rate full allotment policy, it is more likely that those exceptional measures are the cause rather than the outcome of the reduced interbank lending activity.16
|
156 |
-
Nevertheless, aggregated interbank lending is remarkably stable over time (Gabrieli and Georg, 2014).
|
157 |
-
Figure 1 shows the amount of quarterly interbank lending in a highly aggregated form, where the solid line depicts interbank exposures excluding quarterly bank-to-bank exposures of 100 billion and more, and the dashed line shows aggregate interbank lending excluding exposures between banks belonging to the same BHC.17 The beginning of the aggregated crisis period is indicated by the vertical bar at 2007Q3.
Although we do not adjust for price changes, it can be seen that interbank exposures are surprisingly stable over time and actually rise to some extent even in distinct time frames of the crisis period.
Nevertheless, there is indeed a decrease in interbank exposures after 2008Q3 and 2010Q4, i.e. following the non-standard measures taken by the ECB.
16 Unreported robustness tests show that in the full allotment period (2008Q4) itself the likelihood of interbank participation significantly drops by between 1.6 and 7 percent, but we do not detect reduced interbank market exposures in that quarter or in the preceding quarters. Hence, from an aggregated point of view, in the case of Germany one could question the need to change the operational framework, especially as the Italian interbank market was not affected by the 2007 financial crisis.
17 It is noteworthy that there has been an upwards shift of excessively high bank-to-bank exposures of more than 100 billion since 2007Q3. All of these cases are conducted between parent banks and their affiliated mortgage banks. But as there is in some quarters of the crisis period only one such observation, we refrain from showing these data points. In general, excessive bank-to-bank exposures of more than 100 billion peak in 2008Q4 with an amount of 290 billion.
4.1.1 DETERMINANTS OF RELATIONSHIP BANKING

Besides the effects of the 2007 financial crisis period, we are particularly interested in the determinants that potentially foster bank-to-bank relationships.
In this regard, all relationship proxies have a positive impact on the probability of renewing the lending relationship as well as on the concentration measures, except for reciprocal lending. In particular, belonging to the same BHC strongly enhances the probability of a credit relationship and also the amount lent.
Unreported marginal effects show an increase of up to 25% in the probability.
|
171 |
-
Longer and stronger interbank relations in the past, on the other hand, have a positive impact only on the probability of continuing lending, but do not lead to higher exposures.
In fact, the opposite is true, implying that banks tend to hesitate to terminate relationships once they are established and instead prolong lending but on a reduced level, possibly avoiding risk concentration.
|
173 |
-
In contrast, reciprocal lending shows the exact opposite results.
|
174 |
-
Though it is negatively related to the probability of forming a lending relationship between a specific pair of a creditor bank C and a borrower bank B, reciprocal lending from the initial borrower B to creditor C leads to significantly higher exposures from C to B in the first place.
|
175 |
-
The first result regarding the lower probability of forming a credit relationship due to reverse lending is not exactly odd, as it is possible to argue that borrower banks generally hesitate to lend during the same quarter in which they actually borrow.
|
176 |
-
The second result, however, could be an initial indication that reverse relationship banking has a positive effect owing to the fact that it signals the bank's own soundness.
Another possible explanation might be a swap in maturities.
|
178 |
-
Unfortunately information on maturities is not directly available in our data.
|
179 |
-
Creditor and borrower bank-specific variables reveal unexpected results insofar as higher capital (CAPR) and liquidity (LIQR) do not lead to higher interbank exposures.
In general, better capitalized banks seem to avoid participating in the interbank market, maybe because they tend to engage in more profitable retail business rather than interbank lending activities and also have different ways of financing.
|
181 |
-
Indeed, creditor banks with higher loans to assets ratios (LAR) are less likely to participate in the interbank market and provide less lending as well, while consequently borrowing more.
Results regarding the creditor and borrower bank's liquidity are to some extent more puzzling, but, though they are statistically significant, they are economically negligible.
|
183 |
-
In contrast, parameter estimates of Size indicate that larger banks are more likely to establish interbank lending relationships and that they receive and provide more interbank financing.
As the borrower bank's coefficient is around seven times larger than the coefficient of the lender, it seems to be the case that this is not only due to the simple fact that larger banks are faced, on the one hand, with higher financing needs and, on the other hand, are also capable of providing more lending.
|
185 |
-
For one thing, these results might reflect different business models.
|
186 |
-
Descriptive statistics (Table 1, Panel A) already indicate that typically small banks, such as savings and cooperative banks, which can be characterized as retail deposit gathering institutions, step in as interbank creditors, while larger banks such as big banks, regional banks and Landesbanken are mostly liquidity recipients.
Nonetheless, it could also be the case that larger banks benefit from "too-big-to-fail" as there is a substantially higher likelihood of these banks being bailed out.
|
188 |
-
In quantitative terms, a one SD increase in a borrower bank's Size enhances its interbank market borrowing capacity by around 70 percentage points. Not surprisingly, higher profitability (ROA(rw)) also enhances the probability as well as the amount a bank can borrow via the interbank market.
Results regarding the concentration measures are in line with common expectations, for instance larger banks lending to or borrowing from a larger number of counterparties.
|
191 |
-
Finally, the highly significant and positive coefficient of the inverse Mills ratio signifies that simple OLS would indeed produce upwardly biased estimates.
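Schematically, the two-step correction can be summarized as follows. The notation is generic and added here for clarity; it mirrors the definition of Heckman's lambda in Table 2 but does not reproduce the paper's exact equations (3) and (6).

```latex
% Step 1 (selection, probit):
\Pr\bigl(\text{Credit relation}_{CBt} = 1\bigr) \;=\; \Phi\!\left(z_{CBt}'\gamma\right)

% Step 2 (outcome, OLS with the inverse Mills ratio as additional regressor):
\Delta \ln \text{Exposure}_{CBt}
  \;=\; x_{CBt}'\beta \;+\; \theta\,\hat{\lambda}_{CBt} \;+\; \varepsilon_{CBt},
\qquad
\hat{\lambda}_{CBt} \;=\; \frac{\phi\!\left(z_{CBt}'\hat{\gamma}\right)}{\Phi\!\left(z_{CBt}'\hat{\gamma}\right)}
```

In this notation, the positive and significant inverse Mills ratio coefficient reported above corresponds to a positive estimate of theta.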
4.2 INTERBANK RELATIONS AND RISK IN TIMES OF AGGREGATE MARKET TURMOIL

Results of the previous section reveal a remarkably stable interbank market, which was, in fact, affected to a high degree statistically but, on an aggregated level, not economically by the ongoing 2007 financial crisis.
Considering the ECB's non-standard measures, which provided nearly inexhaustible cheap liquidity, and the fact that the ECB even changed its monetary policy instruments owing to some banks' inability to roll over their interbank positions, it could be asked how the above results fit this reality. To shed some light on this question, we expand our baseline Heckit models, which consist of the probit model (3) in the first step and the corresponding OLS model (6) in the second step, by interaction terms (8) and (10); that is, we interact the aggregated Crisis variable with our relationship proxies and risk measures. Regarding the latter, we only report results for the non-performing loans to total loans ratio (NPLR). As a robustness check, we also utilize the bank's Z-score and, for a sub-period since 2008Q1, the bank's PD, but results do not differ qualitatively. Panel A of Table 4 shows the parameter estimates of these models, while the interaction term models' corresponding marginal effects at representative values (Crisis = 1|0) are shown in Panel B.
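The sketch below illustrates, on simulated data, how such a two-step Heckit with a Crisis x NPLR interaction in the second step can be put together with statsmodels. All names, coefficients and the simulated data are hypothetical; this is not the paper's specification, only a minimal working example of the technique.

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import norm

# Simulated pair-quarter panel (purely illustrative)
rng = np.random.default_rng(1)
n = 5000
df = pd.DataFrame({
    "crisis": rng.integers(0, 2, n),
    "nplr_borrower_l4": rng.gamma(2.0, 2.0, n),      # lagged NPLR, in percent
    "total_relation_span": rng.integers(0, 40, n),
    "size_creditor": rng.normal(10.0, 1.0, n),
})
u = rng.normal(size=n)
df["credit_relation"] = (
    -1.0 - 0.3 * df["crisis"] + 0.05 * df["total_relation_span"]
    + 0.1 * df["size_creditor"] + u > 0
).astype(int)
df["dln_exposure"] = (
    -0.02 * df["nplr_borrower_l4"] * (1 + df["crisis"]) + 0.5 * u
    + rng.normal(size=n)
)

# Step 1: selection probit on the full sample
Z = sm.add_constant(df[["crisis", "total_relation_span", "size_creditor"]])
probit = sm.Probit(df["credit_relation"], Z).fit(disp=False)
xb = probit.fittedvalues                             # linear index z'gamma_hat
df["imr"] = norm.pdf(xb) / norm.cdf(xb)              # inverse Mills ratio

# Step 2: OLS on the selected sample, with Crisis x NPLR and the IMR
df["crisis_x_nplr"] = df["crisis"] * df["nplr_borrower_l4"]
sel = df["credit_relation"] == 1
X = sm.add_constant(
    df.loc[sel, ["crisis", "nplr_borrower_l4", "crisis_x_nplr",
                 "total_relation_span", "imr"]]
)
ols = sm.OLS(df.loc[sel, "dln_exposure"], X).fit()

# Effect of NPLR on exposure changes at representative values of Crisis
b = ols.params
print("dExposure/dNPLR | Crisis=0:", b["nplr_borrower_l4"])
print("dExposure/dNPLR | Crisis=1:", b["nplr_borrower_l4"] + b["crisis_x_nplr"])
```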
In both panels, columns 1 and 2 present results of the extended probit model where the dependent dummy variable is Credit relationship, and columns 3 and 4 present results of the corresponding extended OLS model with Exposure change in log differences as the dependent variable.
|
198 |
-
Though PD parameter estimates show negative signs, only creditor banks exhibit a statistically significant reduction of interbank exposures.
|
199 |
-
One possible explanation for these weak results might be given by Behn et al. (2014), who show that the introduction of Basel II-type, model-based capital regulation affected the validity of banks' internal risk estimates.
|
200 |
-
They find that for the same firm (in our case bank) in the same year, both reported PDs and risk-weights are significantly lower, while estimation errors and loan losses are significantly higher for loans under the new regulatory approach.
|
201 |
-
Thus, risk estimates for loans under the model-based approach systematically underestimate actual default rates.
|
202 |
-
Also, results of the quadratic term model show a considerable decreasing effect of higher PDs for both creditor and borrower banks.
|
203 |
-
Regarding the effects of relationship banking on interbank exposures in times of aggregate uncertainty, columns 1 and 3 of Panel B present the marginal effects of the relationship variables for the interaction term model of Panel A, that is the marginal effects of a longer, more intense and reciprocal interbank relationship in the aggregated crisis and the non-crisis period.
|
204 |
-
Generally, the effects of relationship banking on interbank exposures are qualitatively the same as those in the baseline model.
|
205 |
-
Though banks hesitate to terminate bank-to-bank relationships once they have been established, this does not translate into persistent interbank lending: unlike Affinito (2012), we do not detect a significantly positive effect of relationship banking in the crisis period.
Although longer and more intense relations in the past do slightly increase the probability of renewing interbank lending relations in both the crisis and the non-crisis period, we do not detect any positive effects regarding the amount lent.
|
207 |
-
Only reciprocal lending again increases interbank exposures from the initial creditor bank.
|
208 |
-
Moreover, the positive effect is in fact two times larger in the crisis than during the non-crisis period.
|
209 |
-
Whether this is actually because the initial borrower bank signals its own soundness, given that reverse lending is even more important in crisis periods than in non-crisis periods, or whether it is due to maturity swaps in this period is a matter for future research.
The most striking result, however, is that in contrast to Martinez-Peria and Schmukler (2001) and others, who claim that in periods of aggregate distress information about fundamentals is diluted, we show that the exact opposite is true with regard to the 2007 financial crisis.
Columns 2 and 4 of Panel B present the marginal effects of the interaction term risk model of Panel A. Results reveal that the risk coefficient for borrower banks is more than five times larger in the crisis than during the non-crisis period.
|
212 |
-
In other words, a one SD increase in risk reduces interbank exposures by around 18.7 percentage points during the crisis compared with a rather moderate decrease of 3.4 percentage points in the non-crisis period.
|
213 |
-
Additionally, more risky creditor banks reduce their exposures less in the crisis than in the non-crisis periods and, in fact, are, overall, more likely to engage in the interbank market in times of aggregate distress.19 Unreported results regarding the concentration measures show that riskier borrower banks borrow from more counterparties, as the BPI coefficient is highly statistically significant and negative.
All in all, results uncover a so far undocumented ability of the interbank market to distinguish between banks of different quality in times of aggregate distress.
|
215 |
-
As only the worst-performing banks have been rationed by the interbank market, regulators should be reluctant to step in as a lender of last resort to counter supposed failures in liquidity reallocation, since this fosters moral hazard.20 Moreover, relationship banking does not stabilize interbank lending during periods of aggregate turmoil, as hard information seems to dominate soft information.
4.3 IDIOSYNCRATIC SHOCKS AND THE 2007 FINANCIAL CRISIS

One major result of the previous section is that we find that, even during times of aggregate market turmoil and high uncertainty, the intensity of interbank relations reacts to the risk characteristics of the participating banks.
This result suggests that idiosyncratic factors might be important drivers of interbank market outcomes.
|
218 |
-
Hence, in this section we expand the analysis by exploring the role of a wide range of idiosyncratic bank shocks that capture banks' most important determinants.
|
219 |
-
As described in Section 3.2, we run a set of regression analyses where the idiosyncratic shock variable changes in two dimensions.
|
220 |
-
First, it alters with regard to the potential shock we want to analyze, that is, a shock to the bank's capitalization, credit quality, liquidity, profitability or risk.
And, second, the idiosyncratic shock variable alters regarding the strength of the shock, that is, whether it is a moderate or a more serious one, i.e. a heavy slip of several deciles in the underlying variable's distribution from one quarter to another.21 All in all, the basic idea is to somewhat stress-test not the banks' balance sheets against an unfavorable macroeconomic scenario, but the interbank relations themselves, in order to detect breaking points which, in turn, destabilize the interbank market, and to account for their non-linear behavior.
19 In unreported tests we also examine the effect of quadratic terms of our risk measure and find a more concave risk-exposure relationship for borrower banks and a convex one for creditor banks, which confirms an increasing effect of risk for borrower banks and a diminishing effect for creditor banks.
20 Indeed, unreported results show that in the full allotment period (2008Q4), that is the period in which the ECB switched its operational framework from a regular variable-rate tender procedure to a fixed-rate full allotment policy, the market's sensitivity to risk was rather impaired, as risk has a significantly negative impact on interbank borrowing outside the full allotment period and an insignificant one at that time. Nevertheless, this effect was not permanent, as banks generally exhibit a stronger sensitivity to risk in the crisis period than in the non-crisis period.
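A minimal sketch of how the decile-based shock dummies described above can be constructed with pandas is given below. The function and all column names are hypothetical; the yearly re-computation of the deciles follows footnote 21, and only downward slips are flagged (for variables where an increase is unfavorable, such as NPLR, the sign would be flipped).

```python
import pandas as pd

def shock_dummy(panel: pd.DataFrame, var: str, strength: int) -> pd.Series:
    """Illustrative decile-based idiosyncratic shock dummy.

    `panel` is a long-format data frame with columns ['bank', 'year',
    'quarter', var]. A shock of a given `strength` is a slip of at least
    that many deciles in the variable's yearly cross-sectional distribution
    from one quarter to the next (deciles are recomputed every year).
    """
    df = panel.sort_values(["bank", "year", "quarter"]).copy()
    # Decile rank (1..10) within each year's cross-section
    df["decile"] = df.groupby("year")[var].transform(
        lambda s: pd.qcut(s, 10, labels=False, duplicates="drop") + 1
    )
    # Quarter-to-quarter change in the decile rank, per bank
    d_decile = df.groupby("bank")["decile"].diff()
    # Dummy equals one if the bank slipped by at least `strength` deciles
    return (d_decile <= -strength).astype(int).reindex(panel.index)

# Hypothetical usage: a capital-ratio shock of strength four
# panel["capr_shock4"] = shock_dummy(panel, "CAPR", 4)
```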
4.3.1 IDIOSYNCRATIC SHOCKS AND INTERBANK MARKET STABILITY
|
226 |
-
The outcome of this extensive procedure is illustrated in Figure 2, where every tile depicts both the sign and the significance of the regression model's bank-specific shock variables.22 In detail, it depicts the parameter estimates of bank-specific shocks regarding the creditor and borrower banks' capitalization, credit quality and liquidity, with the dashed grey tiles representing significantly negative coefficients and the dotted white tiles denoting significantly positive coefficients.
We present results only for these idiosyncratic shocks as they are the most important ones, severely affecting interbank relations and lending.23 Generally, the left-hand side of Figure 2 shows the results of the first step of the Heckman selection method and the right-hand side shows the results of the second step.
Moreover, parameter estimates of the idiosyncratic shock variables of the baseline models (3) and (6) expanded by the creditor and borrower bank-specific shock variables (11) are presented in the first and third lines, and marginal effects at representative values (Crisis = 1|0) of the idiosyncratic shock variables of the baseline models (3) and (6) expanded by the interaction term (12) are shown in the second and fourth lines, marked by "in crisis".
Starting with idiosyncratic shocks regarding the banks' capitalization, it can be seen that, similar to results of the bank characteristics in the baseline model presented in Section 4.1, lower capital ratios do not affect creditors' interbank exposure.
|
230 |
-
In fact, even the most severe creditor-specific capital shocks do not affect the probability of continuing lending nor the amount lent.24
21 It is important to point out that the distribution of each of the underlying idiosyncratic shock variables is computed at a yearly base, as definitions of what constitutes an adequate or unfavourable level regarding those variables may change over time.
22 Underlying regression results of all idiosyncratic shocks tested are reported in the Appendix (available on request).
23 Similar to results of our baseline model in Section 4.1, lower profitability does not affect interbank stability at all. Even after very heavy declines in profitability from one quarter to another, creditor banks do not reduce interbank lending, nor do borrower banks face problems prolonging their interbank positions. Higher risk in terms of shocks regarding the banks' Z-score or PD did not impair interbank relationships in the recent crisis either.
Idiosyncratic borrower capital shocks do show a different behavior, however, revealing two important insights.
First, borrower-specific capital shocks affect both the probability of continuing an interbank lending relation and the actual exposure itself.
|
238 |
-
Second, results show, as in Section 4.2, some kind of revised risk perception, as the capital shocks' negative effect is triggered earlier in the crisis period.
Nevertheless, while even moderate capital deteriorations in the distribution from one quarter to another have a significantly negative impact on the probability of continuing an interbank relation, we do not detect an actual reduction in interbank exposures before a borrower bank's capital ratio slips, in the crisis period, by four deciles in its yearly distribution or, in other words, before an idiosyncratic shock of the strength four.
In quantitative terms, borrower banks do not suffer on the interbank market from capital write-offs before generally losing 38% of their regulatory capital, or 43% in the crisis period.25
The actual economic effects of such a severe idiosyncratic capital shock are presented in Table 5, where Panel A shows the parameter estimation results of the baseline Heckit models (3) and (6) expanded by the interaction term (12), and Panel B depicts the corresponding marginal effects at representative values (Crisis = 1|0).
In this regard, columns 1 and 2 present results of the interaction term model where the idiosyncratic shock variable is defined as a negative one decile change in the bank's capital ratio's distribution from one quarter to another.
|
243 |
-
In columns 3 and 4 the shock is defined as a two decile change, and in the model presented in columns 5 and 6 the shock dummy variable takes the value one if the capital ratio slips four deciles or more, and zero otherwise.
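For the probit step, marginal effects at representative values (Crisis = 1|0) can be illustrated as the average change in the predicted probability when the shock dummy switches from zero to one, holding Crisis at each representative value. The helper below is only a sketch under these assumptions: `result` stands for any fitted statsmodels probit whose design matrix is a data frame, and all column names in the usage example are hypothetical.

```python
import pandas as pd

def mer_of_shock(result, X: pd.DataFrame, shock: str, rep: str = "crisis",
                 interaction=None) -> pd.Series:
    """Average discrete-change effect of `shock` on P(y=1), evaluated
    separately at the representative values rep=0 and rep=1."""
    out = {}
    for rep_val in (0, 1):
        preds = {}
        for shock_val in (0, 1):
            Xc = X.copy()
            Xc[rep] = rep_val
            Xc[shock] = shock_val
            if interaction is not None:
                # rebuild the shock x crisis column so it stays consistent
                Xc[interaction] = Xc[shock] * Xc[rep]
            preds[shock_val] = result.predict(Xc).mean()
        out[f"{rep}={rep_val}"] = preds[1] - preds[0]
    return pd.Series(out)

# Hypothetical usage:
# mer_of_shock(probit_fit, Z, shock="capital_shock4", interaction="shock_x_crisis")
```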
Results show a looming negative effect of borrower-specific capital shocks, starting with a slight decrease in the probability of continuing a lending relationship in the case of a moderate shock of the strength two. The outcome at the actual breaking point is presented in Column 6 of Panel B, where we detect a reduction in lending of around 66 percentage points after a capital shock of the strength four.
24 We detect negative effects only for creditor banks that are in the worst decile of the yearly capital distribution.
25 The mean regulatory capital reduction for a borrower bank in the case of an idiosyncratic shock of the size five is 7.79 percentage points, which refers to a capital reduction in relative terms of 38.18% regarding a mean capital ratio of 20.4%. In contrast, the mean capital ratio of a borrower bank in the crisis period is 23.77% and the capital reduction in the case of an idiosyncratic shock of the size four is 10.12 percentage points, which amounts to a relative capital reduction of 42.57%.
Summing up, interbank relations are remarkably resistant with regard to bank-specific capital shocks; that is, only severe capital write-offs of around 40% actually impair lending relationships.
Most notable is the fact that idiosyncratic capital shocks affect interbank stability solely via the borrower side and even more in periods of aggregate turmoil.
|
250 |
-
In contrast, creditor banks do not reduce their interbank market activity independently of their level of capitalization.
|
251 |
-
Idiosyncratic liquidity shocks show results which are quite similar to those of shocks regarding the banks' capitalization.
|
252 |
-
Again, creditor banks seem to be remarkably resistant to liquidity drains but, in contrast, even small bank-specific liquidity shocks affect borrower banks negatively, i.e. they reduce interbank lending.
Table 6 presents regression results of interaction term models with the idiosyncratic shock variable ranging from a one decile change in columns 1 and 2, through a two decile change in columns 3 and 4, to a bank-specific shock of the strength three in columns 5 and 6, i.e. a three decile change from one quarter to another in the yearly distribution of the creditor and the borrower liquidity ratio, respectively.
In general, we detect a higher reduction in interbank exposures, the stronger the idiosyncratic shock is.
|
255 |
-
But most interestingly, effects are nearly four times larger in the non-crisis period than in the actual crisis period.
|
256 |
-
For instance, an idiosyncratic shock in the crisis period of the strength three, i.e. a loss of around 34% in the borrower banks' liquid assets reduces interbank exposures by 11 percentage points.
|
257 |
-
In contrast, a bank-specific shock of the same strength in the non-crisis period leads to a reduction in interbank exposures of nearly 44 percentage points.
One possible explanation for liquidity shocks affecting interbank lending less in the crisis than in the non-crisis period might be the role played by the central bank in flooding the market with huge amounts of liquidity and acting as the central counterparty in large parts of the money market. The banks' level of capitalization and liquidity has been an important and intensively discussed issue, and problems of undercapitalization and insufficient liquidity have been addressed not only at the international level by Basel III, for example, but also at national and European levels by compelling banks to hold higher capital and liquidity buffers.
Nevertheless, we reveal another, so far broadly underexplored issue which plays a part in destabilizing the interbank market, namely the banks' credit quality.
In contrast to idiosyncratic capitalization and liquidity shocks, shocks regarding the banks' credit quality impair interbank relations not just from one side of the lending relationship but also from the other.
|
261 |
-
On the one hand, creditor banks withdraw from the interbank market by reducing lending and, on the other hand, borrower banks receive less financing as well.
Table 7 provides some detailed results on the interaction models' parameter estimates, where columns 1 and 2 depict shocks of the strength one, columns 3 and 4 show shocks of the strength four, and columns 5 and 6 contain shocks of the strength eight.
|
263 |
-
In line with results on the banks' capitalization, idiosyncratic credit quality shocks affect borrower banks distinctly more during the crisis period than in the non-crisis period.
|
264 |
-
And, similar to capitalization shocks, we see a looming effect of credit quality shocks, first affecting the probability of continuing the interbank lending relation and, from a slip of three deciles in the distribution of the underlying variable onwards, an increasing reduction of interbank exposures, starting with a lending cut of 11 percentage points and ultimately adding up to a reduction of more than 75 percentage points in the case of a severe idiosyncratic shock of the strength eight.

4.3.2 IDIOSYNCRATIC SHOCKS AND RELATIONSHIP BANKING

So far we have demonstrated that idiosyncratic shocks heavily disturb interbank lending relations and can potentially impair market stability itself.
As in Section 4.2, one can ask whether relationship banking in the form of a longer and more intensive interbank relationship in the past can help to overcome the negative effects of idiosyncratic shocks.
|
266 |
-
To answer this question, we further expand both steps of the baseline Heckman Correction models (3) and (6) by the interaction term (13); that is, we interact the creditor and borrower bank-specific shocks with our relationship proxy variables.
26 These results are to some extent mirrored by the ones of the quadratic term models, which show for creditor banks a convex and for borrower banks a more concave relationship between the non-performing loans to total loans ratio and interbank exposure, indicating a decreasing effect for the former and an increasing effect for the latter.
In contrast to Cocco et al. (2009), Affinito (2012) and others who present empirical evidence that relationship banking indeed helped to overcome market turmoil in the recent financial crisis, our results show that hard information dominates soft information.
|
270 |
-
Tables 8 to 10 show parameter estimates of the interaction term models, where columns 1 and 2 present the regression coefficients and columns 3 and 4 the corresponding marginal effects. Table 8 shows that the negative effects of a capital shock of the strength five cannot be undone either by a longer or a more intensive interbank relation in the past or even by reciprocal lending.
It should be borne in mind that an idiosyncratic capital shock of that strength is the weakest possible shock that impairs interbank lending in general.
|
272 |
-
Results of more severe shocks are analogous to those presented and imply that the prediction of the literature on bank-firm customer relationships, namely that banks ensure the availability of credit to customer firms when these firms are in trouble, does not hold in a bank-bank context.
As the interbank market is able to distinguish between banks of different quality even in times of aggregate distress, hard information seems to dominate soft information.
|
274 |
-
Likewise, Table 9 presents results of the case where a creditor or a borrower bank is hit by an idiosyncratic liquidity shock of the strength one which represents a slip of one decile in the underlying variable's yearly distribution.
|
275 |
-
Though we do not detect a significant positive effect of relationship banking in terms of a longer and more intensive relationship, we do again present some evidence that reciprocal lending has a number of benefits if the borrower bank has been hit by such an idiosyncratic liquidity shock.
|
276 |
-
A one SD increase in reciprocal lending, that is, lending from the initial borrower bank B to the creditor bank C, increases interbank lending by between 6 and 13 percentage points in the first place.
We report marginal effects at representative values only for cases where the idiosyncratic shock exhibits an interbank lending reduction for the first time, that is in its weakest definition.
|
278 |
-
The positive effect of reciprocal lending also shows up in cases where either the creditor or the borrower bank is hit by a shock regarding its credit quality (Table 10).
|
279 |
-
As idiosyncratic shocks regarding the banks' credit quality affect interbank lending from both sides of the interbank lending relationship, Panel A in Table 10 presents estimation results and the corresponding marginal effects of a credit quality shock of the strength one, which affects creditor banks in particular, and Panel B shows results of a shock of the strength five, when borrower banks also start to suffer from an exceptionally strong increase in their non-performing loans to total loans ratio (NPLR).
In this regard, a one SD increase in reciprocal lending increases interbank lending to the stressed borrowing bank by between 16 and 22 percentage points and by between 12 and 14 percentage points even when it is the creditor bank that is in stress.
|
281 |
-
All in all, results show that relationship banking is not distinctively capable of overcoming bank-specific, i.e. self-induced problems, as hard information seems to dominate.
|
282 |
-
Only reciprocal lending does, to some extent, increase interbank lending activity, maybe due to signaling effects or maturity swaps.
|
283 |
-
We employ a broad range of sensitivity analyses to assess the robustness of our findings.
|
284 |
-
In general, we conduct various robustness checks on our overall dataset, such as the level at which we correct for outliers, the overall sample that we analyze, and the utilized merger correction procedure.
|
285 |
-
We also conduct checks on the definition of our main variables of interest and the models' distinct specification for testing.
|
286 |
-
We start with sensitivity analyses for the overall structure of our database.
|
287 |
-
For the first set of control variables, namely bank characteristics, we delete outliers at the 1% level (except for Size), but also rerun our specifications without carrying out any outlier correction measures.
In general, we utilize varying sets of control variables, such as alternative capital, liquidity and profitability ratios and specifications without bank group controls, or without any set of control variables at all.
|
289 |
-
Regarding the sample size, we rerun the models for private banks only, that is without Landesbanken, savings and cooperative banks and central institutions of cooperative banks, as well as for a sub-period since 2008 where we are able to utilize the banks' estimates of their counterparty's probability of default (PD).
Finally, a number of bank mergers took place between the first quarter of 2000 and the third quarter of 2012.
Therefore, we carry out a merger correction procedure by creating a new separate bank after the merger takes place.
|
292 |
-
Generally, the duration of the relationships still amounts to nearly three years, which should be a sufficient amount of time to overcome asymmetric information due to relationship banking (Rochet & Tirole, 1996).
|
293 |
-
Nevertheless, results are robust to alternative specifications.
|
294 |
-
5.2.1 RELATION, CRISIS AND RISK MEASURES

Second, regarding our main variables of interest, such as the relationship proxies, we use varying lags, especially for those utilized in the baseline specification with contemporaneous values.
Further, beyond splitting the aggregated crisis period into different sub-crisis periods, like the commercial paper crisis (2007Q3 - 2008Q3), and varying the starting points of these crisis periods, we analyze a set of periods of special interest; for instance, the period in which the ECB switched its operational framework from a regular variable-rate tender procedure to a fixed-rate full allotment policy.
Results show that in the full allotment period in 2008Q4 itself the likelihood of interbank participation drops significantly by between 1.6 and 7 percent, but we do not detect reduced interbank market exposures in that period or in the preceding quarters.
|
297 |
-
Regarding the risk measure, we utilize not only the non-performing loans to total loans ratio (NPLR) but also the bank's Z-score, as well as the bank's PD for a sub-period since 2008Q1.
Above and beyond that, as we identify idiosyncratic shocks to be the most important determinants of interbank market stability, we examine a broad range of model specifications and modify the definition of an idiosyncratic shock in several ways.
|
299 |
-
First, we redefine idiosyncratic shocks so that a shock is associated only with a drop into the second quartile of the distribution of the underlying shock variable.
|
300 |
-
In other words, the shock dummy variable does not take the value one in those cases where the borrower or the creditor bank experiences a quarter-to-quarter slip, say, in their capital or liquidity ratio distribution from a high to a moderate point, but at least into the 50th percentile.
|
301 |
-
Second, we extend our models by a dummy variable that takes the value one if a bank is already in the worst decile of the underlying variable's distribution, as those banks by definition do not exhibit an idiosyncratic shock.
|
302 |
-
Results do not differ substantially from the ones presented; if anything, effects are to some extent even more pronounced.
Finally, we examine the effect of quadratic terms, which do indeed display a non-linear behavior.
|
304 |
-
For instance, we find a more concave risk-exposure relationship for borrower banks and a convex risk-exposure relationship for creditor banks, confirming an increasing effect of risk for borrower banks and a diminishing effect for creditor banks.
|
305 |
-
Alongside the main idiosyncratic shocks presented, which severely affect interbank relations and lending, we also examine shocks of creditor banks' and borrower banks' Z-scores, PD and profitability.28
|
306 |
-
As in results of our baseline model in Section 4.1, lower profitability does not affect interbank stability at all.
|
307 |
-
Even after extremely sharp declines in profitability from one quarter to another, creditor banks do not reduce interbank lending, nor do borrower banks face problems in prolonging their interbank positions.
28 See Appendix (available on request).
Higher risk in terms of shocks regarding the banks' Z-score did not impair interbank relationships in the recent crisis either.
|
309 |
-
Similar to liquidity shocks, results show a difference between the crisis and the non-crisis periods as lower Z-scores destabilize interbank lending more in the non-crisis period than in the actual crisis period.
|
310 |
-
In fact, lower Z-scores only reduce interbank lending in the non-crisis period (Appendix Table A10).
|
311 |
-
Results regarding higher probabilities of default are to some extent sketchy, as we only detect interbank exposure reductions of up to 7.4 percentage points after a creditor, or borrower PD shock of the size of one (Appendix Table A5).
|
312 |
-
One possible explanation might be given by Behn et al. (2014), who show that the introduction of Basel II-type, model-based capital regulation affected the validity of banks' internal risk estimates.
Also, results of the quadratic term model show a considerable decreasing effect of higher PDs for both creditor and borrower banks.
|
314 |
-
Though the importance of interbank relations for the distribution of liquidity is well recognized, the main drivers that foster the persistence and the strength of interbank relations or trigger their collapse are as yet unknown.
|
315 |
-
In this study we present novel evidence of the microeconometric determinants of banks' bilateral positions.
|
316 |
-
In particular, while existing research is mostly concerned with the effects of aggregate shocks, such as the 2007 commercial paper crisis or the Lehman insolvency, on the functioning of interbank relations, we focus on the so far underexplored importance of idiosyncratic bank shocks, that is, shocks with regard to distinct individual banks' balance sheet positions.
By disentangling the effects and the inherently differing information content of aggregate and idiosyncratic shocks, we provide evidence of whether some banks' inability to roll over their interbank positions in the recent financial crisis was due to a failure of the interbank market in reallocating liquidity efficiently within the banking sector itself, i.e. a frozen interbank market, or rather to revised bank-level risk perceptions that lead to a stressed money market.
|
318 |
-
Our results clearly confirm the latter proposition.
|
319 |
-
Though detecting a statistically significant but small reduction in bank-to-bank exposures due to the crisis, we clearly identify idiosyncratic shocks to be substantially more important for the recent disruptions in the interbank market.
Indeed, banks avoid terminating interbank relationships, but, economically and statistically, they reduce their exposures based on hard information about their peers.
|
321 |
-
Moreover, identifying idiosyncratic shocks as the main driver disrupting interbank lending, we also analyze the effects of risk taking and reciprocal behavior on the banks' bilateral exposures and test whether relationship banking can outweigh the negative effects induced by bank-specific shocks.
|
322 |
-
Unlike earlier studies which find that relationship banking helps to overcome financial instability, we show distinct evidence that, except for reciprocal lending, this is not the case for the German interbank market.
Neither longer nor more intense interbank relationships in the past contain the negative effects of either aggregate or idiosyncratic shocks regarding the banks' capital, credit quality or liquidity.
|
324 |
-
Summing up, our results show that the inability of some banks to roll over their interbank position and the ensuing financial market turmoil were not due to a failure of the interbank market per se but rather to bank-specific shocks affecting the banks' capital, liquidity and credit quality.
|
325 |
-
Most importantly, the results uncover a so far undocumented ability of the interbank market to distinguish between banks of different quality in times of aggregate distress.
|
326 |
-
REFERENCES

Affinito, M. (2012). Do interbank customer relationships exist? And how did they function in the crisis? Learning from Italy. Journal of Banking and Finance.
Allen, F., E. Carletti, and D. Gale (2009). Interbank market liquidity and central bank intervention. Journal of Monetary Economics.
Working Paper 75, SAFE.
In Proceedings of a Conference on Bank Structure and Competition, Federal Reserve Bank of Chicago.
Berger, A., S. Davies, and M. Flannery (2000). Comparing Market and Regulatory Assessments of Bank Performance: Who Knows What When? Journal of Money,
Bhattacharya, S., and D. Gale (1987). Preference Shocks, Liquidity, and Central Bank Policy. In W. Barnett and K. Singleton (Eds.),
Bräuning, F., and F. Fecht (2012). Relationship lending in the interbank market and the price of liquidity.
Blueprints for a new global financial architecture.
for the integration of the euro area money market?
Cocco, J. F., F. Gomes, and N. Martins (2009). Lending relationships in the interbank
Craig, B., F. Fecht, and G. Tümer-Alkan (2015). The role of interbank relationships and
DeYoung, R., M. Flannery, W. Lang, and S. Sorescu (1998). The Informational Advantage of Specialized Monitors: The Case of Bank Examiners. Working paper, Federal Reserve Bank of Chicago.
Dinger, V., and J. von Hagen (2009). Does Interbank Borrowing Reduce Bank Risk?
Financial Crises, Payment System Problems, and Discount Window Liquidity Provision by the Central Bank. Journal of Money, Credit, and Banking 32(3),
Freixas, X., and J. Jorge (2008). The Role of Interbank Markets in Monetary Policy: A
Furfine, C. (1999). The microstructure of the federal funds market.
Banks as Monitors of Other Banks: Evidence from the Overnight
Furfine, C. (2002). The interbank market during a crisis. European Economic Review 46(4-5).
Gabrieli, S., and C.-P. Georg (2014). A network view on interbank market freezes.
Goodfriend, M., and R. King (1988). Financial deregulation, monetary policy, and central banking. Federal Reserve Bank of Richmond Economic Review 74, 3-22.
Interest on Reserves and Monetary Policy. Federal Reserve Bank
Gyntelberg, J., and P. Wooldridge (2008). Interbank Rate Fixings during the Recent Turmoil. Bank for International Settlements Quarterly Review, 59-72.
Hasan, I., K. Jackowicz, O. Kowalewski, and L. Kozlowski (2013). Market discipline during crisis: Evidence from bank depositors in transition countries. Journal of Banking and
Hayden, E., D. Porath, and N. Westernhagen (2007). Does Diversification Improve the Performance of German Banks? Evidence from Individual Bank Loan Portfolios.
Heider, F., M. Hoerova, and C. Holthausen (2009). Liquidity Hoarding and Interbank Market Spreads: The Role of Counterparty Risk. Working Paper 1126, European Central Bank.
Ho, T., and A. Saunders (1985). A Micro Model of the Federal Funds Market.
Lender of last resort: A contemporary perspective.
Discipline and Liquidity in the Interbank Market. Journal of Money,
Levy-Yeyati, E., M. Martinez Peria, and S. Schmukler (2004). Market Discipline under Systemic Risk: Evidence from Bank Runs in Emerging Economies. Working Paper
Martin, A., and J. McAndrews (2007). Why are there no intraday money markets? Mimeo, Federal Reserve Bank of New York.
Martinez-Peria, M., and S. Schmukler (2001). Do Depositors Punish Banks for Bad Behavior? Market Discipline, Deposit Insurance, and Banking Crises. Journal of
Michaud, F., and C. Upper (2008). Bank for International Settlements Quarterly Review, 47-58.
Ongena, S., G. Tümer-Alkan, and N. von Westernhagen (2012).
Peek, J., E. Rosengren, and G. Tootell (1999). Is Bank Supervision Central to Central
Petersen, M., and R. Rajan (1994). The Benefits of Lending Relationships: Evidence from
Rochet, J.-C., and J. Tirole (1996). Interbank Lending and Systemic Risk. Journal of Money, Credit and Banking.
Schertler, A., C. Buch, and N. Westernhagen (2006). Heterogeneity in Lending and Sectoral Growth: Evidence from German Bank-level Data.
The Deutsche Bundesbank's Large Credit Database (BAKIS-M and
Schwartz, A. (1992). The Misuse of the Fed's Discount Window. Federal Reserve Bank of
Taylor, J., and J. Williams (2009). A Black Swan in the Money Market.
Williams, R. (2011). Using Stata's Margins Command to Estimate and Interpret Adjusted Predictions and Marginal Effects.
This figure shows the amount of quarterly interbank lending in a highly aggregated form, where the solid line depicts interbank exposures excluding quarterly bank to bank exposures of 100 billion and more and the dashed line shows aggregate interbank lending excluding exposures between banks belonging to the same BHC.
|
394 |
-
The beginning of the aggregated crisis period is indicated by the vertical bar at 2007Q3.
|
395 |
-
It is noteworthy that there is an upwards shift of excessive high bank-to-bank exposures of more than 100 billion since 2007Q3.
|
396 |
-
All of these cases are conducted between parent banks and their affiliated mortgage banks.
|
397 |
-
But as there is in some quarters of the crisis period only one such observation, we refrain from showing these data points.
|
398 |
-
In general, excessive bank-to-bank exposures of more than 100 billion peak in 2008Q4 with an amount of 290 billion. (Legend: Exposure (without 100 bil); Exposure (without BHC).)

FIGURE 2 MAIN IDIOSYNCRATIC SHOCKS
(Panel headings: Shock defined as a x decile.)

This figure illustrates the parameter estimates of the baseline Heckman Two-Step Correction Model augmented by a creditor and borrower bank-specific shock and an interaction term of the idiosyncratic shock and the "Crisis" variable, which is a dummy variable that takes the value one from 2007Q3 onwards and zero otherwise.
The bank-specific shock variable is an alternating dummy variable that takes the value one if there is a bad or unfavorable change in the distribution of the underlying shock variable of 1 (2,..., 9) decile(s) from one quarter to another and zero otherwise, whereas we portioned the distribution into 10 equal percentiles.
|
401 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship", which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise.
|
402 |
-
The LHS variable for the second step is "Exposure change" in log differences.
|
403 |
-
For the right-hand side variables (RHS) we use all variables of the baseline regression model augmented by the idiosyncratic shock variable and the interaction term between the shock and the "Crisis" variable.
Generally, the left-hand side of the figure shows results of the first step of the Heckman selection method, and the right-hand side results of the second step.
|
405 |
-
Parameter estimates of the idiosyncratic shock variables of the baseline Heckit model augmented by those shock variables are presented in the first and third lines, respectively.
Marginal effects at representative values (Crisis=1|0) of the idiosyncratic shock variables of the baseline model augmented by the interaction term are illustrated in the second and fourth lines marked by "in crisis".
|
407 |
-
The figure illustrates parameter estimates of idiosyncratic shocks regarding the creditor and borrower bank's capitalization, credit quality and liquidity, while the dashed grey tiles represent significantly negative coefficients and the dotted white tiles significantly positive coefficients.
|
408 |
-
(Bank group categories: Regional bank; Private mortgage bank; Subsidiary of a foreign bank; Public real estate credit agency; Landesbank; Bank with special functions; Savings bank; Foreign subsidiary of a German bank.)
Panel A of this table shows summary statistics regarding the number of banks, their distinct bank group and the number of bank quarter observations regarding those entities as well as the overall number of observations.
In this regard, BHC refers to bank holding company.
|
410 |
-
Panel B provides summary statistics of (reciprocal) interbank exposures, concentration measures as well as summary statistics regarding the duration (break) of bank-to-bank relationships.
|
411 |
-
Concentration measures are the lender preference index "LPI", which is the amount lent by a creditor bank C to a borrower bank B relative to the overall amount lent by bank C in any distinct quarter, and the borrower preference index "BPI", which is calculated as the amount borrowed by bank B from bank C relative to the overall borrowing by bank B, respectively. "Credit relation span" adds up the bank quarters of a creditor bank C providing continuous lending to a specific borrower bank B, "Reciproc relation span" captures the continuous reverse lending from bank B to bank C and "Total relationship span" adds up the quarters both banks C and B are related to each other in either direction. Panel C provides descriptive statistics about the most important bank characteristics, whereas each bank's Z-score is calculated as the sum of the return on risk-weighted assets and the capital asset ratio divided by the return on risk-weighted assets' standard deviation.
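In symbols, the verbal Z-score definition above can be written as follows; the notation (CAR for the capital asset ratio, ROA(rw) for the return on risk-weighted assets) is added here for clarity.

```latex
Z_{it} \;=\; \frac{\mathrm{ROA(rw)}_{it} + \mathrm{CAR}_{it}}{\sigma\!\left(\mathrm{ROA(rw)}_{i}\right)}
```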
TABLE 2 DESCRIPTION OF VARIABLES

PANEL A Left-hand side (LHS)
Credit relation: Dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise.

PANEL B Right-hand side (RHS)
Crisis: Dummy variable that takes the value one from 2007Q3 onwards and zero otherwise.
Commercial paper crisis: Dummy variable that takes the value one between 2007Q3 and 2008Q3 and zero otherwise.
Lehman crisis: Dummy variable that takes the value one between 2008Q4 and 2009Q4 and zero otherwise. 0|1
Euro crisis: Dummy variable that takes the value one between 2010Q1 and 2012Q3 and zero otherwise.
Total relation span: Captures the interbank history of a specific pair of banks C and B by adding up the quarters these two banks have either a lending or borrowing relationship in quarter t.
Exposure t-1: Logarithm of the lagged exposure from the creditor bank C to the borrower bank B. ln
Reciproc exposure: Reciprocal lending from the initial borrower bank B to the creditor bank C. ln
BHC dummy: Dummy variable for banks belonging to the same bank holding company. 0|1
NPLR: Non-performing loans to total loans ratio. %
PD: Median value of all creditor banks' C estimates of borrower bank B's probability of default.
Idiosyncratic shock: An alternating dummy variable that takes the value one if there is a bad or unfavorable change in the distribution of the underlying shock variable x (= CAPR, NPLR, LIQR, ...) from one quarter to another and zero otherwise, whereas we portioned the distribution into 10 equal percentiles.
Size: Logarithm of total assets. ln
LAR: Loans to asset ratio (without interbank loans). %
LIQR: Liquid assets to total assets ratio. %
CAPR: Regulatory capital ratio. %
ROA(rw): Return on risk weighted assets. %
Heckman's lambda: Ratio between the standard normal probability density function and the standard normal cumulative distribution function, each evaluated at the respective observation.
Bank group dummies: Dummy variables classifying each bank in any distinct quarter into a specific bank group listed in Panel A of Table 1.

Panel A of this table presents our left-hand side (LHS) and Panel B a comprehensive list of varying right-hand side (RHS) variables.
This table presents the estimation results of the baseline Heckman Two-Step Correction Model.
In the first step, the left-hand side variable (LHS) is "Credit relationship" which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Column 1 and 2).
|
435 |
-
The LHS variable for the second step is either "Exposure change" in log differences (Column 3 and 4), the lender preference index "LPI" (Column 5 and 6) or the borrower preference index "BPI" (7 and 8).
|
436 |
-
The first group of right-hand side variables (RHS) captures the effects of the 2007 financial crisis period by two different crisis specifications.
Columns 1, 3, 5 and 7 show results of the aggregated crisis period, where the "Crisis" variable is a dummy variable that takes the value one from 2007Q3 onwards and zero otherwise.
|
438 |
-
Columns 2, 4, 6 and 8 present results where the crisis period is split up into a "Commercial paper crisis", a "Lehman crisis" and a "Euro crisis" period, which are dummy variables that take the value one in the defined period and zero otherwise.
The second group of the RHS variables account for the banks' relationship intensity.
|
440 |
-
"Total relation span" counts the number of sustained quarters bank C and bank B interact with each other, either as creditors or borrowers.
|
441 |
-
"Exposure t - 1 " is the log pre-quarter exposure from the creditor bank C to borrower bank B, "Reciproc exposure" is the log reciprocal exposure from bank B to bank C, and the "BHC dummy" variable takes the value one if both banks belong to same bank holding company and zero otherwise.
|
442 |
-
The third group of the RHS variables control for bank characteristics.
|
443 |
-
We use the banks' balance sheet items with a one quarter lag and delete spurious outliers at the 1 percent level except "Size" which is the banks' log assets.
|
444 |
-
Finally, we account for the creditor's and borrower's distinct bank groups, respectively.
|
445 |
-
(Column headings: Credit relation; Exposure change; LPI; BPI. Row labels include: Crisis x Exposure t-1; Crisis x NPLR creditor t-4; Crisis x NPLR borrower t-4; NPLR borrower t-1; Baseline variables: Yes.)

TABLE 4 INTERBANK RELATIONS & RISK IN TIMES OF AGGREGATE MARKET TURMOIL

Panel A of this table presents the estimation results of the baseline Heckman Two-Step Correction Model augmented first by interaction terms between the aggregated "Crisis" variable and the bank-to-bank relationship proxies and second by interaction terms between the "Crisis" variable and a risk measure, namely the non-performing loans to asset ratio (NPLR) with a one year lag.
The "Crisis" variable is a dummy variable that takes the value one from 2007Q3 onwards and zero otherwise.
|
448 |
-
Panel B shows the marginal effects at representative values for these interaction term variables.
|
449 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship", which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Columns 1 and 2 in Panel A and B).
|
450 |
-
The LHS variable for the second step is "Exposure change" in log differences (Columns 3 and 4 in Panel A and B).
|
451 |
-
For the right-hand side variables (RHS) we use all variables of the baseline regression model augmented by the interaction terms described above.
(Column headings: Credit relation; Exposure change; Exposure change.)
Panel A of this table shows the estimation results of the baseline Heckman Two-Step Correction Model augmented by an interaction term of a creditor and borrower bank-specific shock and the "Crisis" variable, which is a dummy variable that takes the value one from 2007Q3 onwards and zero otherwise.
The bank-specific shock variable is an alternating dummy variable that takes the value one if there is a bad or unfavorable change in the distribution of the underlying shock variable of 1 (2,..., 9) decile(s) from one quarter to another and zero otherwise, whereas we portioned the distribution into 10 equal percentiles.
|
454 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship" which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Column 1, 3 and 5 of Panel A and B).
|
455 |
-
The LHS variable for the second step is "Exposure change" in log differences (Column 2, 4 and 6 of Panel A and B).
|
456 |
-
For the right-hand side variables (RHS) we use all variables of the baseline regression model augmented by the interaction term between the idiosyncratic shock and the "Crisis" variable.
Panel B shows the models' corresponding marginal effects at representative values whereas the table generally depicts estimation results of idiosyncratic shocks of the strengths one, two and four, that is an unfavourable change in the underlying variable's distribution from one quarter to another of one, two and four deciles, respectively.
|
458 |
-
TABLE 5 IDIOSYNCRATIC CAPITAL SHOCK X CRISIS
(Column group headings: one decile change; two decile change; four decile change.)
Panel A of this table shows the estimation results of the baseline Heckman Two-Step Correction Model augmented by an interaction term of a creditor and borrower bank-specific shock and the "Crisis" variable, which is a dummy variable that takes the value one from 2007Q3 onwards and zero otherwise.
The bank-specific shock variable is a dummy variable that takes the value one if there is an unfavorable change of 1 (2, ..., 9) decile(s) in the distribution of the underlying shock variable from one quarter to the next and zero otherwise, where we partition the distribution into ten deciles.
|
460 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship" which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Column 1, 3 and 5 of Panel A and B).
|
461 |
-
The LHS variable for the second step is "Exposure change" in log differences (Column 2, 4 and 6 of Panel A and B).
|
462 |
-
For the right-hand side variables (RHS), we use all variables of the baseline regression model, augmented by the interaction term between the idiosyncratic shock and the "Crisis" variable.
|
463 |
-
Panel B shows the models' corresponding marginal effects at representative values. The table depicts estimation results for idiosyncratic shocks of strengths one, two and three, that is, an unfavourable change in the underlying variable's distribution from one quarter to the next of one, two and three deciles, respectively.
|
464 |
-
TABLE 6: IDIOSYNCRATIC LIQUIDITY SHOCK X CRISIS (columns: one decile change; two decile change; three decile change). Panel A of this table shows the estimation results of the baseline Heckman Two-Step Correction Model augmented by interaction terms between creditor- and borrower-specific bank shocks and the "Crisis" variable, which is a dummy variable that takes the value one from 2007Q3 onwards and zero otherwise.
|
465 |
-
The bank-specific shock variable is a dummy variable that takes the value one if there is an unfavorable change of 1 (2, ..., 9) decile(s) in the distribution of the underlying shock variable from one quarter to the next and zero otherwise, where we partition the distribution into ten deciles.
|
466 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship" which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Column 1, 3 and 5 of Panel A and B).
|
467 |
-
The LHS variable for the second step is "Exposure change" in log differences (Column 2, 4 and 6 of Panel A and B).
|
468 |
-
For the right-hand side variables (RHS), we use all variables of the baseline regression model, augmented by the interaction term between the idiosyncratic shock and the "Crisis" variable.
|
469 |
-
Panel B shows the models' corresponding marginal effects at representative values. The table depicts estimation results for idiosyncratic shocks of strengths one, four and eight, that is, an unfavourable change in the underlying variable's distribution from one quarter to the next of one, four and eight deciles, respectively.
|
470 |
-
TABLE 7: IDIOSYNCRATIC CREDIT QUALITY SHOCK X CRISIS (columns: one decile change; four decile change; eight decile change). TABLE 8: IDIOSYNCRATIC CAPITAL SHOCK X RELATIONSHIP
|
471 |
-
This table presents the estimation results of the baseline Heckman Two-Step Correction Model augmented by interaction terms between creditor- and borrower-specific bank shocks and the relationship proxy variables, as well as the interaction terms' corresponding marginal effects at representative values.
|
472 |
-
The shock variable is a dummy variable that takes the value one if there is an unfavorable change of 1 (2, ..., 9) decile(s) in the distribution of the underlying shock variable from one quarter to the next and zero otherwise, where we partition the distribution into ten deciles.
|
473 |
-
Proxy variables that account for the banks' relationship intensity are as follows.
|
474 |
-
"Total relation span" counts the number of sustained quarters bank C and bank B interact with each other, either as creditors or borrowers, "Exposure t - 1" is the log pre-quarter exposure from the creditor bank C to borrower bank B and "Reciproc exposure" is the log reciprocal exposure from bank B to bank C
|
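As a rough sketch of how the three proxies could be computed from a bilateral exposure panel, see below; the layout (one row per creditor-borrower-quarter with a positive 'exposure' column) and all names are assumptions rather than the paper's actual construction, and "Total relation span" is simplified to a running count for the directed pair.

import numpy as np
import pandas as pd

def add_relationship_proxies(links: pd.DataFrame) -> pd.DataFrame:
    # links: one row per (creditor, borrower, quarter) with a positive 'exposure'.
    links = links.sort_values(["creditor", "borrower", "quarter"]).copy()
    pair = ["creditor", "borrower"]

    # "Total relation span": running count of quarters this directed pair is observed.
    links["total_relation_span"] = links.groupby(pair).cumcount() + 1

    # "Exposure t - 1": log exposure of the same pair in the previous quarter.
    links["exposure_l1"] = np.log(links.groupby(pair)["exposure"].shift(1))

    # "Reciproc exposure": log exposure running in the opposite direction (B -> C).
    reverse = links.rename(columns={"creditor": "borrower", "borrower": "creditor",
                                    "exposure": "reciproc_exposure"})
    links = links.merge(reverse[["creditor", "borrower", "quarter", "reciproc_exposure"]],
                        on=["creditor", "borrower", "quarter"], how="left")
    links["reciproc_exposure"] = np.log(links["reciproc_exposure"])
    return links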
475 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship" which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Column 1 and 3).
|
476 |
-
The LHS variable for the second step is "Exposure change" in log differences (Column 2 and 4).
|
477 |
-
For the right-hand side variables (RHS), we use all variables of the baseline regression model, augmented by the interaction terms of the bank-specific shock variable and the relationship proxies. The table presents estimation results for an idiosyncratic shock of strength five, that is, an unfavourable change in the underlying variable's distribution from one quarter to the next of five deciles.
|
478 |
-
TABLE 9 IDIOSYNCRATIC LIQUIDITY SHOCK X RELATIONSHIP
|
479 |
-
This table presents the estimation results of the baseline Heckman Two-Step Correction Model augmented by interaction terms between creditor- and borrower-specific bank shocks and the relationship proxy variables, as well as the interaction terms' corresponding marginal effects at representative values.
|
480 |
-
The shock variable is a dummy variable that takes the value one if there is an unfavorable change of 1 (2, ..., 9) decile(s) in the distribution of the underlying shock variable from one quarter to the next and zero otherwise, where we partition the distribution into ten deciles.
|
481 |
-
Proxy variables that account for the banks' relationship intensity are as follows.
|
482 |
-
"Total relation span" counts the number of sustained quarters bank C and bank B interact with each other, either as creditors or borrowers, "Exposure t - 1" is the log pre-quarter exposure from the creditor bank C to borrower bank B and "Reciproc exposure" is the log reciprocal exposure from bank B to bank C
|
483 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship" which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Column 1 and 3).
|
484 |
-
The LHS variable for the second step is "Exposure change" in log differences (Column 2 and 4).
|
485 |
-
For the right-hand side variables (RHS), we use all variables of the baseline regression model, augmented by the interaction terms of the bank-specific shock variable and the relationship proxies. The table presents estimation results for an idiosyncratic shock of strength one, that is, an unfavourable change in the underlying variable's distribution from one quarter to the next of one decile.
|
486 |
-
TABLE 10: IDIOSYNCRATIC CREDIT QUALITY SHOCK X RELATIONSHIP. This table presents the estimation results of the baseline Heckman Two-Step Correction Model augmented by interaction terms between creditor- and borrower-specific bank shocks and the relationship proxy variables, as well as the interaction terms' corresponding marginal effects at representative values.
|
487 |
-
The shock variable is a dummy variable that takes the value one if there is an unfavorable change of 1 (2, ..., 9) decile(s) in the distribution of the underlying shock variable from one quarter to the next and zero otherwise, where we partition the distribution into ten deciles.
|
488 |
-
Proxy variables that account for the banks' relationship intensity are as follows.
|
489 |
-
"Total relation span" counts the number of sustained quarters bank C and bank B interact with each other, either as creditors or borrowers, "Exposure t - 1" is the log pre-quarter exposure from the creditor bank C to borrower bank B and "Reciproc exposure" is the log reciprocal exposure from bank B to bank C
|
490 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship" which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Column 1 and 3).
|
491 |
-
The LHS variable for the second step is "Exposure change" in log differences (Column 2 and 4).
|
492 |
-
For the right-hand side variables (RHS), we use all variables of the baseline regression model, augmented by the interaction terms of the bank-specific shock variable and the relationship proxies. Panel A presents estimation results for an idiosyncratic shock of strength one, that is, an unfavourable change in the underlying variable's distribution from one quarter to the next of one decile, and Panel B presents results for an idiosyncratic shock of strength five.
|
493 |
-
This table presents the estimation results of the baseline Heckman Two-Step Correction Model augmented by interaction terms between creditor- and borrower-specific bank shocks and the relationship proxy variables, as well as the interaction terms' corresponding marginal effects at representative values.
|
494 |
-
The shock variable is a dummy variable that takes the value one if there is an unfavorable change of 1 (2, ..., 9) decile(s) in the distribution of the underlying shock variable from one quarter to the next and zero otherwise, where we partition the distribution into ten deciles.
|
495 |
-
Proxy variables that account for the banks' relationship intensity are as follows.
|
496 |
-
"Total relation span" counts the number of sustained quarters bank C and bank B interact with each other, either as creditors or borrowers, "Exposure t - 1" is the log pre-quarter exposure from the creditor bank C to borrower bank B and "Reciproc exposure" is the log reciprocal exposure from bank B to bank C
|
497 |
-
In the first step of the Heckit Model, the left-hand side variable (LHS) is "Credit relationship", which is a dummy variable that takes the value one if there is a distinct credit relationship from a creditor bank C to a borrower bank B and zero otherwise (Columns 1 and 3).
|
498 |
-
The LHS variable for the second step is "Exposure change" in log differences (Columns 2 and 4).
|
499 |
-
For the right-hand side variables (RHS), we use all variables of the baseline regression model, augmented by the interaction terms of the bank-specific shock variable and the relationship proxies. Panel A presents estimation results for an idiosyncratic shock of strength one, that is, an unfavourable change in the underlying variable's distribution from one quarter to the next of one decile, and Panel B presents results for an idiosyncratic shock of strength five.
|
|
examples/BERT - Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf
DELETED
Binary file (775 kB)
|
|
examples/BERT_body.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : β’ We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . 
β’ We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . β’ BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . There is a long history of pre-training general language representations , and we briefly review the most widely-used approaches in this section . Learning widely applicable representations of words has been an active area of research for decades , including non-neural ( Brown et al. , 1992 ; Ando and Zhang , 2005 ; Blitzer et al. , 2006 ) and neural Pennington et al. , 2014 ) methods . Pre-trained word embeddings are an integral part of modern NLP systems , offering significant improvements over embeddings learned from scratch ( Turian et al. , 2010 ) . To pretrain word embedding vectors , left-to-right language modeling objectives have been used ( Mnih and Hinton , 2009 ) , as well as objectives to discriminate correct from incorrect words in left and right context . These approaches have been generalized to coarser granularities , such as sentence embeddings Logeswaran and Lee , 2018 ) or paragraph embeddings ( Le and Mikolov , 2014 ) . To train sentence representations , prior work has used objectives to rank candidate next sentences ( Jernite et al. , 2017 ; Logeswaran and Lee , 2018 ) , left-to-right generation of next sentence words given a representation of the previous sentence , or denoising autoencoder derived objectives ( Hill et al. , 2016 ) . ELMo and its predecessor ( Peters et al. , 2017 ( Peters et al. , , 2018a generalize traditional word embedding research along a different dimension . They extract context-sensitive features from a left-to-right and a right-to-left language model . The contextual representation of each token is the concatenation of the left-to-right and right-to-left representations . When integrating contextual word embeddings with existing task-specific architectures , ELMo advances the state of the art for several major NLP benchmarks ( Peters et al. , 2018a ) including question answering ( Rajpurkar et al. , 2016 ) , sentiment analysis ( Socher et al. , 2013 ) , and named entity recognition ( Tjong Kim Sang and De Meulder , 2003 ) . Melamud et al . ( 2016 ) proposed learning contextual representations through a task to predict a single word from both left and right context using LSTMs . Similar to ELMo , their model is feature-based and not deeply bidirectional . Fedus et al . ( 2018 ) shows that the cloze task can be used to improve the robustness of text generation models . As with the feature-based approaches , the first works in this direction only pre-trained word embedding parameters from unlabeled text ( Collobert and Weston , 2008 ) . More recently , sentence or document encoders which produce contextual token representations have been pre-trained from unlabeled text and fine-tuned for a supervised downstream task ( Dai and Le , 2015 ; Howard and Ruder , 2018 ; Radford et al. , 2018 ) . The advantage of these approaches is that few parameters need to be learned from scratch . At least partly due to this advantage , OpenAI GPT ( Radford et al. , 2018 ) achieved previously state-of-the-art results on many sentencelevel tasks from the GLUE benchmark ( Wang et al. , 2018a ) . 
Left-to-right language model- BERT BERT E [ CLS ] E 1 E [ SEP ] ... E N E 1 ' ... E M ' C T 1 T [ SEP ] ... T N T 1 ' ... T M ' [ CLS ] Tok 1 [ SEP ] ... Tok N Tok 1 ... TokM Question Paragraph Start/End Span BERT E [ CLS ] E 1 E [ SEP ] ... E N E 1 ' ... E M ' C T 1 T [ SEP ] ... T N T 1 ' ... T M ' [ CLS ] Tok 1 [ SEP ] ... Figure 1 : Overall pre-training and fine-tuning procedures for BERT . Apart from output layers , the same architectures are used in both pre-training and fine-tuning . The same pre-trained model parameters are used to initialize models for different down-stream tasks . During fine-tuning , all parameters are fine-tuned . [ CLS ] is a special symbol added in front of every input example , and [ SEP ] is a special separator token ( e.g . separating questions/answers ) . ing and auto-encoder objectives have been used for pre-training such models ( Howard and Ruder , 2018 ; Radford et al. , 2018 ; Dai and Le , 2015 ) . There has also been work showing effective transfer from supervised tasks with large datasets , such as natural language inference ( Conneau et al. , 2017 ) and machine translation ( McCann et al. , 2017 ) . Computer vision research has also demonstrated the importance of transfer learning from large pre-trained models , where an effective recipe is to fine-tune models pre-trained with Ima-geNet ( Deng et al. , 2009 ; Yosinski et al. , 2014 ) . We introduce BERT and its detailed implementation in this section . There are two steps in our framework : pre-training and fine-tuning . During pre-training , the model is trained on unlabeled data over different pre-training tasks . For finetuning , the BERT model is first initialized with the pre-trained parameters , and all of the parameters are fine-tuned using labeled data from the downstream tasks . Each downstream task has separate fine-tuned models , even though they are initialized with the same pre-trained parameters . The question-answering example in Figure 1 will serve as a running example for this section . A distinctive feature of BERT is its unified architecture across different tasks . There is mini-mal difference between the pre-trained architecture and the final downstream architecture . Model Architecture BERT 's model architecture is a multi-layer bidirectional Transformer encoder based on the original implementation described in Vaswani et al . ( 2017 ) and released in the tensor2tensor library . 1 Because the use of Transformers has become common and our implementation is almost identical to the original , we will omit an exhaustive background description of the model architecture and refer readers to Vaswani et al . ( 2017 ) as well as excellent guides such as `` The Annotated Transformer . '' 2 In this work , we denote the number of layers ( i.e. , Transformer blocks ) as L , the hidden size as H , and the number of self-attention heads as A . 3 We primarily report results on two model sizes : BERT BASE ( L=12 , H=768 , A=12 , Total Param-eters=110M ) and BERT LARGE ( L=24 , H=1024 , A=16 , Total Parameters=340M ) . BERT BASE was chosen to have the same model size as OpenAI GPT for comparison purposes . Critically , however , the BERT Transformer uses bidirectional self-attention , while the GPT Transformer uses constrained self-attention where every token can only attend to context to its left . 4 Input/Output Representations To make BERT handle a variety of down-stream tasks , our input representation is able to unambiguously represent both a single sentence and a pair of sentences ( e.g. 
, Question , Answer ) in one token sequence . Throughout this work , a `` sentence '' can be an arbitrary span of contiguous text , rather than an actual linguistic sentence . A `` sequence '' refers to the input token sequence to BERT , which may be a single sentence or two sentences packed together . We use WordPiece embeddings ( Wu et al. , 2016 ) with a 30,000 token vocabulary . The first token of every sequence is always a special classification token ( [ CLS ] ) . The final hidden state corresponding to this token is used as the aggregate sequence representation for classification tasks . Sentence pairs are packed together into a single sequence . We differentiate the sentences in two ways . First , we separate them with a special token ( [ SEP ] ) . Second , we add a learned embedding to every token indicating whether it belongs to sentence A or sentence B . As shown in Figure 1 , we denote input embedding as E , the final hidden vector of the special [ CLS ] token as C β R H , and the final hidden vector for the i th input token as T i β R H . For a given token , its input representation is constructed by summing the corresponding token , segment , and position embeddings . A visualization of this construction can be seen in Figure 2 . Unlike Peters et al . ( 2018a ) and Radford et al . ( 2018 ) , we do not use traditional left-to-right or right-to-left language models to pre-train BERT . Instead , we pre-train BERT using two unsupervised tasks , described in this section . This step is presented in the left part of Figure 1 . Task # 1 : Masked LM Intuitively , it is reasonable to believe that a deep bidirectional model is strictly more powerful than either a left-to-right model or the shallow concatenation of a left-toright and a right-to-left model . Unfortunately , standard conditional language models can only be trained left-to-right or right-to-left , since bidirectional conditioning would allow each word to indirectly `` see itself '' , and the model could trivially predict the target word in a multi-layered context . former is often referred to as a `` Transformer encoder '' while the left-context-only version is referred to as a `` Transformer decoder '' since it can be used for text generation . In order to train a deep bidirectional representation , we simply mask some percentage of the input tokens at random , and then predict those masked tokens . We refer to this procedure as a `` masked LM '' ( MLM ) , although it is often referred to as a Cloze task in the literature ( Taylor , 1953 ) . In this case , the final hidden vectors corresponding to the mask tokens are fed into an output softmax over the vocabulary , as in a standard LM . In all of our experiments , we mask 15 % of all WordPiece tokens in each sequence at random . In contrast to denoising auto-encoders ( Vincent et al. , 2008 ) , we only predict the masked words rather than reconstructing the entire input . Although this allows us to obtain a bidirectional pre-trained model , a downside is that we are creating a mismatch between pre-training and fine-tuning , since the [ MASK ] token does not appear during fine-tuning . To mitigate this , we do not always replace `` masked '' words with the actual [ MASK ] token . The training data generator chooses 15 % of the token positions at random for prediction . If the i-th token is chosen , we replace the i-th token with ( 1 ) the [ MASK ] token 80 % of the time ( 2 ) a random token 10 % of the time ( 3 ) the unchanged i-th token 10 % of the time . 
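As a rough illustration of the 15% selection and the 80/10/10 replacement rule described above, here is a tiny sketch (not the original BERT implementation) that masks an already-tokenized sequence; the vocabulary list is a placeholder, and in practice special tokens such as [CLS] and [SEP] would be excluded from masking.

import random

def mask_tokens(tokens, vocab, mask_prob=0.15):
    # Returns the (possibly modified) tokens plus (position, original token) targets.
    tokens = list(tokens)
    n_to_mask = max(1, int(round(len(tokens) * mask_prob)))
    positions = random.sample(range(len(tokens)), n_to_mask)
    targets = []
    for pos in positions:
        targets.append((pos, tokens[pos]))
        r = random.random()
        if r < 0.8:                    # 80%: replace with the [MASK] token
            tokens[pos] = "[MASK]"
        elif r < 0.9:                  # 10%: replace with a random vocabulary token
            tokens[pos] = random.choice(vocab)
        # remaining 10%: keep the original token unchanged
    return tokens, targets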
Then , T i will be used to predict the original token with cross entropy loss . We compare variations of this procedure in Appendix C.2 . Many important downstream tasks such as Question Answering ( QA ) and Natural Language Inference ( NLI ) are based on understanding the relationship between two sentences , which is not directly captured by language modeling . In order to train a model that understands sentence relationships , we pre-train for a binarized next sentence prediction task that can be trivially generated from any monolingual corpus . Specifically , when choosing the sentences A and B for each pretraining example , 50 % of the time B is the actual next sentence that follows A ( labeled as IsNext ) , and 50 % of the time it is a random sentence from the corpus ( labeled as NotNext ) . As we show in Figure 1 , C is used for next sentence prediction ( NSP ) . 5 Despite its simplicity , we demonstrate in Section 5.1 that pre-training towards this task is very beneficial to both QA and NLI . 6 he likes play # # ing [ SEP ] my dog is cute [ SEP ] Input E [ CLS ] E he E likes E play E # # ing E [ SEP ] E my E dog E is E cute E [ SEP ] Token Embeddings E A E B E B E B E B E B E A E A E A E A E A Segment Embeddings E 0 E 6 E 7 E 8 E 9 E 10 E 1 E 2 E 3 E 4 E 5 Position Embeddings Figure 2 : BERT input representation . The input embeddings are the sum of the token embeddings , the segmentation embeddings and the position embeddings . The NSP task is closely related to representationlearning objectives used in Jernite et al . 2017and Logeswaran and Lee ( 2018 ) . However , in prior work , only sentence embeddings are transferred to down-stream tasks , where BERT transfers all parameters to initialize end-task model parameters . Pre-training data The pre-training procedure largely follows the existing literature on language model pre-training . For the pre-training corpus we use the BooksCorpus ( 800M words ) and English Wikipedia ( 2,500M words ) . For Wikipedia we extract only the text passages and ignore lists , tables , and headers . It is critical to use a document-level corpus rather than a shuffled sentence-level corpus such as the Billion Word Benchmark ( Chelba et al. , 2013 ) in order to extract long contiguous sequences . Fine-tuning is straightforward since the selfattention mechanism in the Transformer allows BERT to model many downstream taskswhether they involve single text or text pairs-by swapping out the appropriate inputs and outputs . For applications involving text pairs , a common pattern is to independently encode text pairs before applying bidirectional cross attention , such as Parikh et al . 2016 ; Seo et al . ( 2017 ) . BERT instead uses the self-attention mechanism to unify these two stages , as encoding a concatenated text pair with self-attention effectively includes bidirectional cross attention between two sentences . For each task , we simply plug in the taskspecific inputs and outputs into BERT and finetune all the parameters end-to-end . At the input , sentence A and sentence B from pre-training are analogous to ( 1 ) sentence pairs in paraphrasing , ( 2 ) hypothesis-premise pairs in entailment , ( 3 ) question-passage pairs in question answering , and ( 4 ) a degenerate text-β
pair in text classification or sequence tagging . At the output , the token representations are fed into an output layer for tokenlevel tasks , such as sequence tagging or question answering , and the [ CLS ] representation is fed into an output layer for classification , such as entailment or sentiment analysis . Compared to pre-training , fine-tuning is relatively inexpensive . All of the results in the paper can be replicated in at most 1 hour on a single Cloud TPU , or a few hours on a GPU , starting from the exact same pre-trained model . 7 We describe the task-specific details in the corresponding subsections of Section 4 . More details can be found in Appendix A.5 . In this section , we present BERT fine-tuning results on 11 NLP tasks . The General Language Understanding Evaluation ( GLUE ) benchmark ( Wang et al. , 2018a ) is a collection of diverse natural language understanding tasks . Detailed descriptions of GLUE datasets are included in Appendix B.1 . To fine-tune on GLUE , we represent the input sequence ( for single sentence or sentence pairs ) as described in Section 3 , and use the final hidden vector C β R H corresponding to the first input token ( [ CLS ] ) as the aggregate representation . The only new parameters introduced during fine-tuning are classification layer weights W β R KΓH , where K is the number of labels . We compute a standard classification loss with C and W , i.e. , log ( softmax ( CW T ) ) . Table 1 : GLUE Test results , scored by the evaluation server ( https : //gluebenchmark.com/leaderboard ) . The number below each task denotes the number of training examples . The `` Average '' column is slightly different than the official GLUE score , since we exclude the problematic WNLI set . 8 BERT and OpenAI GPT are singlemodel , single task . F1 scores are reported for QQP and MRPC , Spearman correlations are reported for STS-B , and accuracy scores are reported for the other tasks . We exclude entries that use BERT as one of their components . We use a batch size of 32 and fine-tune for 3 epochs over the data for all GLUE tasks . For each task , we selected the best fine-tuning learning rate ( among 5e-5 , 4e-5 , 3e-5 , and 2e-5 ) on the Dev set . Additionally , for BERT LARGE we found that finetuning was sometimes unstable on small datasets , so we ran several random restarts and selected the best model on the Dev set . With random restarts , we use the same pre-trained checkpoint but perform different fine-tuning data shuffling and classifier layer initialization . 9 Results are presented in Table 1 . Both BERT BASE and BERT LARGE outperform all systems on all tasks by a substantial margin , obtaining 4.5 % and 7.0 % respective average accuracy improvement over the prior state of the art . Note that BERT BASE and OpenAI GPT are nearly identical in terms of model architecture apart from the attention masking . For the largest and most widely reported GLUE task , MNLI , BERT obtains a 4.6 % absolute accuracy improvement . On the official GLUE leaderboard 10 , BERT LARGE obtains a score of 80.5 , compared to OpenAI GPT , which obtains 72.8 as of the date of writing . We find that BERT LARGE significantly outperforms BERT BASE across all tasks , especially those with very little training data . The effect of model size is explored more thoroughly in Section 5.2 . The Stanford Question Answering Dataset ( SQuAD v1.1 ) is a collection of 100k crowdsourced question/answer pairs ( Rajpurkar et al. , 2016 ) . 
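To spell out the classification head described in this passage (the [CLS] vector C in R^H, classifier weights W in R^(K x H), and a loss of the form log softmax(C W^T)), here is a minimal NumPy sketch for a single example; the shapes follow the text, but the function itself is only illustrative.

import numpy as np

def classification_loss(C, W, label):
    # C: (H,) final hidden vector of [CLS]; W: (K, H) classifier weights; label: gold class id.
    logits = W @ C                                   # (K,) one score per label
    logits = logits - logits.max()                   # shift for numerical stability
    log_probs = logits - np.log(np.exp(logits).sum())
    return -log_probs[label]                         # standard cross-entropy loss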
Given a question and a passage from Wikipedia containing the answer , the task is to predict the answer text span in the passage . As shown in Figure 1 , in the question answering task , we represent the input question and passage as a single packed sequence , with the question using the A embedding and the passage using the B embedding . We only introduce a start vector S β R H and an end vector E β R H during fine-tuning . The probability of word i being the start of the answer span is computed as a dot product between T i and S followed by a softmax over all of the words in the paragraph : P i = e Sβ’T i j e Sβ’T j . The analogous formula is used for the end of the answer span . The score of a candidate span from position i to position j is defined as Sβ’T i + Eβ’T j , and the maximum scoring span where j β₯ i is used as a prediction . The training objective is the sum of the log-likelihoods of the correct start and end positions . We fine-tune for 3 epochs with a learning rate of 5e-5 and a batch size of 32 . Table 2 shows top leaderboard entries as well as results from top published systems ( Seo et al. , 2017 ; Clark and Gardner , 2018 ; Peters et al. , 2018a ; Hu et al. , 2018 ) . The top results from the SQuAD leaderboard do not have up-to-date public system descriptions available , 11 and are allowed to use any public data when training their systems . We therefore use modest data augmentation in our system by first fine-tuning on TriviaQA ( Joshi et al. , 2017 ) befor fine-tuning on SQuAD . Our best performing system outperforms the top leaderboard system by +1.5 F1 in ensembling and +1.3 F1 as a single system . In fact , our single BERT model outperforms the top ensemble system in terms of F1 score . Without TriviaQA fine- tuning data , we only lose 0.1-0.4 F1 , still outperforming all existing systems by a wide margin . 12 The SQuAD 2.0 task extends the SQuAD 1.1 problem definition by allowing for the possibility that no short answer exists in the provided paragraph , making the problem more realistic . We use a simple approach to extend the SQuAD v1.1 BERT model for this task . We treat questions that do not have an answer as having an answer span with start and end at the [ CLS ] token . The probability space for the start and end answer span positions is extended to include the position of the [ CLS ] token . For prediction , we compare the score of the no-answer span : s null = Sβ’C + Eβ’C to the score of the best non-null span 12 The TriviaQA data we used consists of paragraphs from TriviaQA-Wiki formed of the first 400 tokens in documents , that contain at least one of the provided possible answers . Dev Test ESIM+GloVe s i , j = max jβ₯i Sβ’T i + Eβ’T j . We predict a non-null answer whenΕ i , j > s null + Ο , where the threshold Ο is selected on the dev set to maximize F1 . We did not use TriviaQA data for this model . We fine-tuned for 2 epochs with a learning rate of 5e-5 and a batch size of 48 . The results compared to prior leaderboard entries and top published work ( Sun et al. , 2018 ; Wang et al. , 2018b ) are shown in Table 3 , excluding systems that use BERT as one of their components . We observe a +5.1 F1 improvement over the previous best system . The Situations With Adversarial Generations ( SWAG ) dataset contains 113k sentence-pair completion examples that evaluate grounded commonsense inference ( Zellers et al. , 2018 ) . Given a sentence , the task is to choose the most plausible continuation among four choices . 
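The span scoring described above (start probability proportional to exp(S.T_i), end probability to exp(E.T_j), and the best span chosen by maximizing S.T_i + E.T_j over j >= i) can be sketched as follows; T, S and E are placeholder NumPy arrays, and the max_len cap is an implementation convenience rather than something stated in the text.

import numpy as np

def best_span(T, S, E, max_len=30):
    # T: (n_tokens, H) final hidden vectors; S, E: (H,) start and end vectors.
    start_scores = T @ S
    end_scores = T @ E
    best, best_score = (0, 0), -np.inf
    for i in range(len(T)):
        for j in range(i, min(i + max_len, len(T))):   # enforce j >= i
            score = start_scores[i] + end_scores[j]    # S.T_i + E.T_j
            if score > best_score:
                best, best_score = (i, j), score
    return best                                        # predicted (start, end) token indices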
When fine-tuning on the SWAG dataset , we construct four input sequences , each containing the concatenation of the given sentence ( sentence A ) and a possible continuation ( sentence B ) . The only task-specific parameters introduced is a vector whose dot product with the [ CLS ] token representation C denotes a score for each choice which is normalized with a softmax layer . We fine-tune the model for 3 epochs with a learning rate of 2e-5 and a batch size of 16 . Results are presented in Table 4 . BERT LARGE outperforms the authors ' baseline ESIM+ELMo system by +27.1 % and OpenAI GPT by 8.3 % . In this section , we perform ablation experiments over a number of facets of BERT in order to better understand their relative importance . Additional Table 5 : Ablation over the pre-training tasks using the BERT BASE architecture . `` No NSP '' is trained without the next sentence prediction task . `` LTR & No NSP '' is trained as a left-to-right LM without the next sentence prediction , like OpenAI GPT . `` + BiLSTM '' adds a randomly initialized BiLSTM on top of the `` LTR + No NSP '' model during fine-tuning . ablation studies can be found in Appendix C . We demonstrate the importance of the deep bidirectionality of BERT by evaluating two pretraining objectives using exactly the same pretraining data , fine-tuning scheme , and hyperparameters as BERT BASE : No NSP : A bidirectional model which is trained using the `` masked LM '' ( MLM ) but without the `` next sentence prediction '' ( NSP ) task . A left-context-only model which is trained using a standard Left-to-Right ( LTR ) LM , rather than an MLM . The left-only constraint was also applied at fine-tuning , because removing it introduced a pre-train/fine-tune mismatch that degraded downstream performance . Additionally , this model was pre-trained without the NSP task . This is directly comparable to OpenAI GPT , but using our larger training dataset , our input representation , and our fine-tuning scheme . We first examine the impact brought by the NSP task . In Table 5 , we show that removing NSP hurts performance significantly on QNLI , MNLI , and SQuAD 1.1 . Next , we evaluate the impact of training bidirectional representations by comparing `` No NSP '' to `` LTR & No NSP '' . The LTR model performs worse than the MLM model on all tasks , with large drops on MRPC and SQuAD . For SQuAD it is intuitively clear that a LTR model will perform poorly at token predictions , since the token-level hidden states have no rightside context . In order to make a good faith attempt at strengthening the LTR system , we added a randomly initialized BiLSTM on top . This does significantly improve results on SQuAD , but the results are still far worse than those of the pretrained bidirectional models . The BiLSTM hurts performance on the GLUE tasks . We recognize that it would also be possible to train separate LTR and RTL models and represent each token as the concatenation of the two models , as ELMo does . However : ( a ) this is twice as expensive as a single bidirectional model ; ( b ) this is non-intuitive for tasks like QA , since the RTL model would not be able to condition the answer on the question ; ( c ) this it is strictly less powerful than a deep bidirectional model , since it can use both left and right context at every layer . In this section , we explore the effect of model size on fine-tuning task accuracy . 
We trained a number of BERT models with a differing number of layers , hidden units , and attention heads , while otherwise using the same hyperparameters and training procedure as described previously . Results on selected GLUE tasks are shown in Table 6 . In this table , we report the average Dev Set accuracy from 5 random restarts of fine-tuning . We can see that larger models lead to a strict accuracy improvement across all four datasets , even for MRPC which only has 3,600 labeled training examples , and is substantially different from the pre-training tasks . It is also perhaps surprising that we are able to achieve such significant improvements on top of models which are already quite large relative to the existing literature . For example , the largest Transformer explored in Vaswani et al . ( 2017 ) is ( L=6 , H=1024 , A=16 ) with 100M parameters for the encoder , and the largest Transformer we have found in the literature is ( L=64 , H=512 , A=2 ) with 235M parameters ( Al-Rfou et al. , 2018 ) . By contrast , BERT BASE contains 110M parameters and BERT LARGE contains 340M parameters . It has long been known that increasing the model size will lead to continual improvements on large-scale tasks such as machine translation and language modeling , which is demonstrated by the LM perplexity of held-out training data shown in Table 6 . However , we believe that this is the first work to demonstrate convincingly that scaling to extreme model sizes also leads to large improvements on very small scale tasks , provided that the model has been sufficiently pre-trained . Peters et al . ( 2018b ) presented mixed results on the downstream task impact of increasing the pre-trained bi-LM size from two to four layers and Melamud et al . ( 2016 ) mentioned in passing that increasing hidden dimension size from 200 to 600 helped , but increasing further to 1,000 did not bring further improvements . Both of these prior works used a featurebased approach -we hypothesize that when the model is fine-tuned directly on the downstream tasks and uses only a very small number of randomly initialized additional parameters , the taskspecific models can benefit from the larger , more expressive pre-trained representations even when downstream task data is very small . All of the BERT results presented so far have used the fine-tuning approach , where a simple classification layer is added to the pre-trained model , and all parameters are jointly fine-tuned on a downstream task . However , the feature-based approach , where fixed features are extracted from the pretrained model , has certain advantages . First , not all tasks can be easily represented by a Transformer encoder architecture , and therefore require a task-specific model architecture to be added . Second , there are major computational benefits to pre-compute an expensive representation of the training data once and then run many experiments with cheaper models on top of this representation . In this section , we compare the two approaches by applying BERT to the CoNLL-2003 Named Entity Recognition ( NER ) task ( Tjong Kim Sang and De Meulder , 2003 ) . In the input to BERT , we use a case-preserving WordPiece model , and we include the maximal document context provided by the data . Following standard practice , we formulate this as a tagging task but do not use a CRF Table 6 : Ablation over BERT model size . # L = the number of layers ; # H = hidden size ; # A = number of attention heads . 
`` LM ( ppl ) '' is the masked LM perplexity of held-out training data . Dev F1 Test F1 ELMo ( Peters et al. , 2018a ) 95.7 92.2 CVT -92.6 CSE ( Akbik et al. , 2018 layer in the output . We use the representation of the first sub-token as the input to the token-level classifier over the NER label set . To ablate the fine-tuning approach , we apply the feature-based approach by extracting the activations from one or more layers without fine-tuning any parameters of BERT . These contextual embeddings are used as input to a randomly initialized two-layer 768-dimensional BiLSTM before the classification layer . Results are presented in Table 7 . BERT LARGE performs competitively with state-of-the-art methods . The best performing method concatenates the token representations from the top four hidden layers of the pre-trained Transformer , which is only 0.3 F1 behind fine-tuning the entire model . This demonstrates that BERT is effective for both finetuning and feature-based approaches . Recent empirical improvements due to transfer learning with language models have demonstrated that rich , unsupervised pre-training is an integral part of many language understanding systems . In particular , these results enable even low-resource tasks to benefit from deep unidirectional architectures . Our major contribution is further generalizing these findings to deep bidirectional architectures , allowing the same pre-trained model to successfully tackle a broad set of NLP tasks . Masked LM and the Masking Procedure Assuming the unlabeled sentence is my dog is hairy , and during the random masking procedure we chose the 4-th token ( which corresponding to hairy ) , our masking procedure can be further illustrated by β’ 10 % of the time : Replace the word with a random word , e.g. , my dog is hairy β my dog is apple β’ 10 % of the time : Keep the word unchanged , e.g. , my dog is hairy β my dog is hairy . The purpose of this is to bias the representation towards the actual observed word . The advantage of this procedure is that the Transformer encoder does not know which words it will be asked to predict or which have been replaced by random words , so it is forced to keep a distributional contextual representation of every input token . Additionally , because random replacement only occurs for 1.5 % of all tokens ( i.e. , 10 % of 15 % ) , this does not seem to harm the model 's language understanding capability . In Section C.2 , we evaluate the impact this procedure . Compared to standard langauge model training , the masked LM only make predictions on 15 % of tokens in each batch , which suggests that more pre-training steps may be required for the model to converge . In Section C.1 we demonstrate that MLM does converge marginally slower than a leftto-right model ( which predicts every token ) , but the empirical improvements of the MLM model far outweigh the increased training cost . T 1 T 2 T N ... ... ... ... ... E 1 E 2 E N ... T 1 T 2 T N ... E 1 E 2 E N ... T 1 T 2 T N ... E 1 E 2 E N ... Next Sentence Prediction The next sentence prediction task can be illustrated in the following examples . To generate each training input sequence , we sample two spans of text from the corpus , which we refer to as `` sentences '' even though they are typically much longer than single sentences ( but can be shorter also ) . The first sentence receives the A embedding and the second receives the B embedding . 
50 % of the time B is the actual next sentence that follows A and 50 % of the time it is a random sentence , which is done for the `` next sentence prediction '' task . They are sampled such that the combined length is β€ 512 tokens . The LM masking is applied after WordPiece tokenization with a uniform masking rate of 15 % , and no special consideration given to partial word pieces . We train with batch size of 256 sequences ( 256 sequences * 512 tokens = 128,000 tokens/batch ) for 1,000,000 steps , which is approximately 40 epochs over the 3.3 billion word corpus . We use Adam with learning rate of 1e-4 , Ξ² 1 = 0.9 , Ξ² 2 = 0.999 , L2 weight decay of 0.01 , learning rate warmup over the first 10,000 steps , and linear decay of the learning rate . We use a dropout probability of 0.1 on all layers . We use a gelu activation ( Hendrycks and Gimpel , 2016 ) rather than the standard relu , following OpenAI GPT . The training loss is the sum of the mean masked LM likelihood and the mean next sentence prediction likelihood . Training of BERT BASE was performed on 4 Cloud TPUs in Pod configuration ( 16 TPU chips total ) . 13 Training of BERT LARGE was performed on 16 Cloud TPUs ( 64 TPU chips total ) . Each pretraining took 4 days to complete . Longer sequences are disproportionately expensive because attention is quadratic to the sequence length . To speed up pretraing in our experiments , we pre-train the model with sequence length of 128 for 90 % of the steps . Then , we train the rest 10 % of the steps of sequence of 512 to learn the positional embeddings . For fine-tuning , most model hyperparameters are the same as in pre-training , with the exception of the batch size , learning rate , and number of training epochs . The dropout probability was always kept at 0.1 . The optimal hyperparameter values are task-specific , but we found the following range of possible values to work well across all tasks : β’ Batch size : 16 , 32 β’ Learning rate ( Adam ) : 5e-5 , 3e-5 , 2e-5 β’ Number of epochs : 2 , 3 , 4 We also observed that large data sets ( e.g. , 100k+ labeled training examples ) were far less sensitive to hyperparameter choice than small data sets . Fine-tuning is typically very fast , so it is reasonable to simply run an exhaustive search over the above parameters and choose the model that performs best on the development set . OpenAI GPT Here we studies the differences in recent popular representation learning models including ELMo , OpenAI GPT and BERT . The comparisons between the model architectures are shown visually in Figure 3 . Note that in addition to the architecture differences , BERT and OpenAI GPT are finetuning approaches , while ELMo is a feature-based approach . The most comparable existing pre-training method to BERT is OpenAI GPT , which trains a left-to-right Transformer LM on a large text corpus . In fact , many of the design decisions in BERT were intentionally made to make it as close to GPT as possible so that the two methods could be minimally compared . The core argument of this work is that the bi-directionality and the two pretraining tasks presented in Section 3.1 account for the majority of the empirical improvements , but we do note that there are several other differences between how BERT and GPT were trained : β’ GPT is trained on the BooksCorpus ( 800M words ) ; BERT is trained on the BooksCorpus ( 800M words ) and Wikipedia ( 2,500M words ) . 
β’ β’ GPT was trained for 1M steps with a batch size of 32,000 words ; BERT was trained for 1M steps with a batch size of 128,000 words . β’ GPT used the same learning rate of 5e-5 for all fine-tuning experiments ; BERT chooses a task-specific fine-tuning learning rate which performs the best on the development set . To isolate the effect of these differences , we perform ablation experiments in Section 5.1 which demonstrate that the majority of the improvements are in fact coming from the two pre-training tasks and the bidirectionality they enable . The illustration of fine-tuning BERT on different tasks can be seen in Figure 4 . Our task-specific models are formed by incorporating BERT with one additional output layer , so a minimal number of parameters need to be learned from scratch . Among the tasks , Our GLUE results in Table1 are obtained from https : //gluebenchmark.com/ leaderboard and https : //blog . openai.com/language-unsupervised . The GLUE benchmark includes the following datasets , the descriptions of which were originally summarized in Wang et al . ( 2018a ) : MNLI Multi-Genre Natural Language Inference is a large-scale , crowdsourced entailment classification task ( Williams et al. , 2018 ) . Given a pair of sentences , the goal is to predict whether the second sentence is an entailment , contradiction , or neutral with respect to the first one . QQP Quora Question Pairs is a binary classification task where the goal is to determine if two questions asked on Quora are semantically equivalent . QNLI Question Natural Language Inference is a version of the Stanford Question Answering Dataset ( Rajpurkar et al. , 2016 ) which has been converted to a binary classification task ( Wang et al. , 2018a ) . The positive examples are ( question , sentence ) pairs which do contain the correct answer , and the negative examples are ( question , sentence ) from the same paragraph which do not contain the answer . BERT E [ CLS ] E 1 E [ SEP ] . .. E N E 1 ' ... E M ' C T 1 T [ SEP ] ... T N T 1 ' ... T M ' [ CLS ] Tok 1 [ SEP ] ... Tok N Tok 1 ... Tok M Question Paragraph BERT E [ CLS ] E 1 E 2 E N C T 1 T 2 T N Single Sentence ... ... BERT Tok 1 Tok 2 Tok N ... [ CLS ] E [ CLS ] E 1 E 2 E N C T 1 T 2 T N Single Sentence B-PER O O ... ... E [ CLS ] E 1 E [ SEP ] Class Label ... E N E 1 ' ... E M ' C T 1 T [ SEP ] ... T N T 1 ' ... The Stanford Sentiment Treebank is a binary single-sentence classification task consisting of sentences extracted from movie reviews with human annotations of their sentiment ( Socher et al. , 2013 ) . CoLA The Corpus of Linguistic Acceptability is a binary single-sentence classification task , where the goal is to predict whether an English sentence is linguistically `` acceptable '' or not ( Warstadt et al. , 2018 ) . The Semantic Textual Similarity Benchmark is a collection of sentence pairs drawn from news headlines and other sources ( Cer et al. , 2017 ) . They were annotated with a score from 1 to 5 denoting how similar the two sentences are in terms of semantic meaning . MRPC Microsoft Research Paraphrase Corpus consists of sentence pairs automatically extracted from online news sources , with human annotations for whether the sentences in the pair are semantically equivalent ( Dolan and Brockett , 2005 ) . RTE Recognizing Textual Entailment is a binary entailment task similar to MNLI , but with much less training data ( Bentivogli et al. , 2009 ) . 14 WNLI Winograd NLI is a small natural language inference dataset ( Levesque et al. , 2011 ) . 
The GLUE webpage notes that there are issues with the construction of this dataset , 15 and every trained system that 's been submitted to GLUE has performed worse than the 65.1 baseline accuracy of predicting the majority class . We therefore exclude this set to be fair to OpenAI GPT . For our GLUE submission , we always predicted the ma-jority class . C.1 Effect of Number of Training Steps Figure 5 presents MNLI Dev accuracy after finetuning from a checkpoint that has been pre-trained for k steps . This allows us to answer the following questions : 1 . Question : Does BERT really need such a large amount of pre-training ( 128,000 words/batch * 1,000,000 steps ) to achieve high fine-tuning accuracy ? Answer : Yes , BERT BASE achieves almost 1.0 % additional accuracy on MNLI when trained on 1M steps compared to 500k steps . 2 . Question : Does MLM pre-training converge slower than LTR pre-training , since only 15 % of words are predicted in each batch rather than every word ? Answer : The MLM model does converge slightly slower than the LTR model . However , in terms of absolute accuracy the MLM model begins to outperform the LTR model almost immediately . In Section 3.1 , we mention that BERT uses a mixed strategy for masking the target tokens when pre-training with the masked language model ( MLM ) objective . The following is an ablation study to evaluate the effect of different masking strategies . Note that the purpose of the masking strategies is to reduce the mismatch between pre-training and fine-tuning , as the [ MASK ] symbol never appears during the fine-tuning stage . We report the Dev results for both MNLI and NER . For NER , we report both fine-tuning and feature-based approaches , as we expect the mismatch will be amplified for the feature-based approach as the model will not have the chance to adjust the representations . The results are presented in Table 8 . In the table , MASK means that we replace the target token with the [ MASK ] symbol for MLM ; SAME means that we keep the target token as is ; RND means that we replace the target token with another random token . The numbers in the left part of the table represent the probabilities of the specific strategies used during MLM pre-training ( BERT uses 80 % , 10 % , 10 % ) . The right part of the paper represents the Dev set results . For the feature-based approach , we concatenate the last 4 layers of BERT as the features , which was shown to be the best approach in Section 5.3 . From the table it can be seen that fine-tuning is surprisingly robust to different masking strategies . However , as expected , using only the MASK strategy was problematic when applying the featurebased approach to NER . Interestingly , using only the RND strategy performs much worse than our strategy as well . https : //github.com/tensorflow/tensor2tensor 2 http : //nlp.seas.harvard.edu/2018/04/03/attention.html 3 In all cases we set the feed-forward/filter size to be 4H , i.e. , 3072 for the H = 768 and 4096 for the H = 1024 . The final model achieves 97 % -98 % accuracy on NSP.6 The vector C is not a meaningful sentence representation without fine-tuning , since it was trained with NSP . For example , the BERT SQuAD model can be trained in around 30 minutes on a single Cloud TPU to achieve a Dev F1 score of 91.0 % .8 See ( 10 ) in https : //gluebenchmark.com/faq . 
The GLUE data set distribution does not include the Test labels , and we only made a single GLUE evaluation server submission for each of BERTBASE and BERTLARGE.10 https : //gluebenchmark.com/leaderboard QANet is described inYu et al . ( 2018 ) , but the system has improved substantially after publication . https : //cloudplatform.googleblog.com/2018/06/Cloud-TPU-now-offers-preemptible-pricing-and-globalavailability.html Note that we only report single-task fine-tuning results in this paper . A multitask fine-tuning approach could potentially push the performance even further . For example , we did observe substantial improvements on RTE from multitask training with MNLI.15 https : //gluebenchmark.com/faq
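The 80%/10%/10% MASK/SAME/RND mixing strategy described in the appendix text above can be sketched as follows. This is a minimal illustration in plain Python, not the authors' original implementation; the function name, argument defaults, and MASK_TOKEN constant are ours.

import random

MASK_TOKEN = "[MASK]"

def mask_tokens(tokens, vocab, select_prob=0.15, mask_rate=0.8, same_rate=0.1):
    """Select ~15% of positions as prediction targets; of those, replace 80%
    with [MASK], keep 10% unchanged, and replace 10% with a random token."""
    masked = list(tokens)
    targets = {}  # position -> original token the MLM loss would predict
    for i, token in enumerate(tokens):
        if random.random() >= select_prob:
            continue
        targets[i] = token
        r = random.random()
        if r < mask_rate:                     # MASK strategy
            masked[i] = MASK_TOKEN
        elif r < mask_rate + same_rate:       # SAME strategy: keep token as is
            pass
        else:                                 # RND strategy: random replacement
            masked[i] = random.choice(vocab)
    return masked, targets

Only the positions recorded in targets contribute to the masked-language-model loss; all other positions are left untouched.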
examples/BERT_paper.pdf
DELETED
Binary file (775 kB)
examples/H01-1042.pdf
DELETED
Binary file (43.1 kB)
examples/H01-1042_body.txt
DELETED
@@ -1 +0,0 @@
Machine translation evaluation and language learner evaluation have been associated for many years , for example [ 5 , 7 ] . One attractive aspect of language learner evaluation which recommends it to machine translation evaluation is the expectation that the produced language is not perfect , well-formed language . Language learner evaluation systems are geared towards determining the specific kinds of errors that language learners make . Additionally , language learner evaluation , more than many MT evaluations , seeks to build models of language acquisition which could parallel ( but not correspond directly to ) the development of MT systems . These models frequently are feature-based and may provide informative metrics for diagnostic evaluation for system designers and users . In a recent experiment along these lines , Jones and Rusk [ 2 ] present a reasonable idea for measuring intelligibility , that of trying to score the English output of translation systems using a wide variety of metrics . In essence , they are looking at the degree to which a given output is English and comparing this to humanproduced English . Their goal was to find a scoring function for the quality of English that can enable the learning of a good translation grammar . Their method for accomplishing this is through using existing natural language processing applications on the translated data and using these to come up with a numeric value indicating degree of `` Englishness '' . The measures they utilized included syntactic indicators such as word n-grams , number of edges in the parse ( both Collins and Apple Pie parser were used ) , log probability of the parse , execution of the parse , overall score of the parse , etc . Semantic criteria were based primarily on WordNet and incorporated the average minimum hyponym path length , path found ratio , percent of words with sense in WordNet . Other semantic criteria utilized mutual information measures . Two problems can be found with their approach . The first is that the data was drawn from dictionaries . Usage examples in dictionaries , while they provide great information , are not necessarily representative of typical language use . In fact , they tend to highlight unusual usage patterns or cases . Second , and more relevant to our purposes , is that they were looking at the glass as half-full instead of half-empty . We believe that our results will show that measuring intelligibility is not nearly as useful as finding a lack of intelligibility . This is not new in MT evaluation -as numerous approaches have been suggested to identify translation errors , such as [ 1 , 6 ] . In this instance , however , we are not counting errors to come up with a intelligibility score as much as finding out how quickly the intelligibility can be measured . Additionally , we are looking to a field where the essence of scoring is looking at error cases , that of language learning . The basic part of scoring learner language ( particularly second language acquisition and English as a second language ) consists of identifying likely errors and understanding the cause of them . From these , diagnostic models of language learning can be built and used to effectively remediate learner errors , [ 3 ] provide an excellent example of this . Furthermore , language learner testing seeks to measure the student 's ability to produce language which is fluent ( intelligible ) and correct ( adequate or informative ) . 
These are the same criteria typically used to measure MT system capability .1 In looking at different second language acquisition ( SLA ) testing paradigms , one experiment stands out as a useful starting point for our purposes . One experiment in particular serves as the model for this investigation . In their test of language teachers , Meara and Babi [ 3 ] looked at assessors making a native speaker ( L1 ) / language learner ( L2 ) distinction in written essays .2 They showed the assessors essays one word at a time and counted the number of words it took to make the distinction . They found that assessors could accurately attribute L1 texts 83.9 % of the time and L2 texts 87.2 % of the time for 180 texts and 18 assessors . Additionally , they found that assessors could make the L1/L2 distinction in less than 100 words . They also learned that it took longer to confirm that an essay was a native speaker 's than a language learner 's . It took , on average , 53.9 words to recognize an L1 text and only 36.7 words to accurately distinguish an L2 text . While their purpose was to rate the language assessment process , the results are intriguing from an MT perspective . They attribute the fact that L2 took less words to identify to the fact that L1 writing `` can only be identified negatively by the absence of errors , or the absence of awkward writing . '' While they could not readily select features , lexical or syntactic , on which evaluators consistently made their evaluation , they hypothesize that there is a `` tolerance threshold '' for low quality writing . In essence , once the pain threshold had been reached through errors , missteps or inconsistencies , then the assessor could confidently make the assessment . It is this finding that we use to disagree with Jones and Rusk [ 2 ] basic premise . Instead of looking for what the MT system got right , it is more fruitful to analyze what the MT system failed to capture , from an intelligibility standpoint . This kind of diagnostic is more difficult , as we will discuss later . We take this as the starting point for looking at assessing the intelligibility of MT output . The question to be answered is does this apply to distinguishing between expert translation and MT output ? This paper reports on an experiment to answer this question . We believe that human assessors key off of specific error types and that an analysis of the results of the experiment will enable us to do a program which automatically gets these . We started with publicly available data which was developed during the 1994 DARPA Machine Translation Evaluations [ 8 ] , focusing on the Spanish language evaluation first . They may be obtained at : http : //ursula.georgetown.edu . 3 We selected the first 50 translations from each system and from the reference translation . We extracted the first portion of each translation ( from 98 to 140 words as determined by sentence boundaries ) . In addition , we removed headlines , as we felt these served as distracters . Participants were recruited through the author 's workplace , through the author 's neighborhood and a nearby daycare center . Most were computer professionals and some were familiar with MT development or use . Each subject was given a set of six extracts -a mix of different machine and human translations . The participants were told to read line by line until they were able to make a distinction between the possible authors of the text -a human translator or a machine translator . 
The first twenty-five test subjects were given no information about the expertise of the human translator . The second twenty-five test subjects were told that the human translator was an expert . They were given up to three minutes per text , although they frequently required much less time . Finally , they were asked to circle the word at which they made their distinction . Figure 1 shows a sample text . The general secretary of the UN , Butros Butros-Ghali , was pronounced on Wednesday in favor of a solution `` more properly Haitian '' resulting of a `` commitment '' negotiated between the parts , if the international sanctions against Haiti continue being ineffectual to restore the democracy in that country . While United States multiplied the last days the threats of an intervention to fight to compel to the golpistas to abandon the power , Butros Ghali estimated in a directed report on Wednesday to the general Assembly of the UN that a solution of the Haitian crisis only it will be able be obtained `` with a commitment , based on constructive and consented grants '' by the parts . Our first question is does this kind of test apply to distinguishing between expert translation and MT output ? The answer is yes . Subjects were able to distinguish MT output from human translations 88.4 % of the time , overall . This determination is more straightforward for readers than the native/non-native speaker distinction . There was a degree of variation on a persystem basis , as captured in Table 1 . Additionally , as presented in Table 2 , the number of words to determine that a text was human was nearly twice the closest system . 4 The second question is does this ability correlate with the intelligibility scores applied by human raters ? One way to look at the answer to this is to view the fact that the more intelligible a system output , the harder it is to distinguish from human output . So , systems which have lower scores for human judgment should have higher intelligibility scores . Table 3 presents the scores with the fluency scores as judged by human assessors . Indeed , the systems with the lowest fluency scores were most easily attributed . The system with the best fluency score was also the one most confused . Individual articles in the test sample will need to be evaluated statistically before a definite correlation can be determined , but the results are encouraging . 4 For those texts where the participants failed to mark a specific spot , the length of the text was included in the average . The final question is are there characteristics of the MT output which enable the decision to be made quickly ? The initial results lead us to believe that it is so . Not translated words ( non proper nouns ) were generally immediate clues as to the fact that a system produced the results . Other factors included : incorrect pronoun translation ; incorrect preposition translation ; incorrect punctuation . A more detailed breakdown of the selection criteria and the errors occurring before the selected word is currently in process . An area for further analysis is that of the looking at the details of the post-test interviews . These have consistently shown that the deciders utilized error spotting , although the types and sensitivities of the errors differed from subject to subject . Some errors were serious enough to make the choice obvious where others had to occur more than once to push the decision above a threshold . 
Extending this to a new language pair is also desirable as a language more divergent than Spanish from English might give different ( and possibly even stronger ) results . Finally , we are working on constructing a program , using principles from Computer Assisted Language Learning ( CALL ) program design , which is aimed to duplicate the ability to assess human versus system texts . My thanks goes to all test subjects and Ken Samuel for review . The discussion of whether or not MT output should be compared to human translation output is grist for other papers and other forums.2 In their experiment , they were examining students learning Spanish as a second language . Data has since been moved to a new location .
examples/N18-3011_body.txt
DELETED
@@ -1 +0,0 @@
The goal of this work is to facilitate algorithmic discovery in the scientific literature . Despite notable advances in scientific search engines , data mining and digital libraries ( e.g. , Wu et al. , 2014 ) , researchers remain unable to answer simple questions such as : What is the percentage of female subjects in depression clinical trials ? Which of my co-authors published one or more papers on coreference resolution ? Which papers discuss the effects of Ranibizumab on the Retina ? In this paper , we focus on the problem of extracting structured data from scientific documents , which can later be used in natural language interfaces ( e.g. , Iyer et al. , 2017 ) or to improve ranking of results in academic search ( e.g. , Xiong et al. , Figure 1 : Part of the literature graph . 2017 ) . We describe methods used in a scalable deployed production system for extracting structured information from scientific documents into the literature graph ( see Fig . 1 ) . The literature graph is a directed property graph which summarizes key information in the literature and can be used to answer the queries mentioned earlier as well as more complex queries . For example , in order to compute the ErdΕs number of an author X , the graph can be queried to find the number of nodes on the shortest undirected path between author X and Paul ErdΕs such that all edges on the path are labeled `` authored '' . We reduce literature graph construction into familiar NLP tasks such as sequence labeling , entity linking and relation extraction , and address some of the impractical assumptions commonly made in the standard formulations of these tasks . For example , most research on named entity recognition tasks report results on large labeled datasets such as CoNLL-2003 and ACE-2005 ( e.g. , Lample et al. , 2016 , and assume that entity types in the test set match those labeled in the training set ( including work on domain adaptation , e.g. , DaumΓ© , 2007 ) . These assumptions , while useful for developing and benchmarking new methods , are unrealistic for many domains and applications . The paper also serves as an overview of the approach we adopt at www.semanticscholar.org in a step towards more intelligent academic search engines ( Etzioni , 2011 ) . In the next section , we start by describing our symbolic representation of the literature . Then , we discuss how we extract metadata associated with a paper such as authors and references , then how we extract the entities mentioned in paper text . Before we conclude , we briefly describe other research challenges we are actively working on in order to improve the quality of the literature graph . The literature graph is a property graph with directed edges . Unlike Resource Description Framework ( RDF ) graphs , nodes and edges in property graphs have an internal structure which is more suitable for representing complex data types such as papers and entities . In this section , we describe the attributes associated with nodes and edges of different types in the literature graph . Papers . We obtain metadata and PDF files of papers via partnerships with publishers ( e.g. , Springer , Nature ) , catalogs ( e.g. , DBLP , MED-LINE ) , pre-publishing services ( e.g. , arXiv , bioRxive ) , as well as web-crawling . Paper nodes are associated with a set of attributes such as 'title ' , 'abstract ' , 'full text ' , 'venues ' and 'publication year ' . 
While some of the paper sources provide these attributes as metadata , it is often necessary to extract them from the paper PDF ( details in Β§3 ) . We deterministically remove duplicate papers based on string similarity of their metadata , resulting in 37M unique paper nodes . Papers in the literature graph cover a variety of scientific disciplines , including computer science , molecular biology , microbiology and neuroscience . Authors . Each node of this type represents a unique author , with attributes such as 'first name ' and 'last name ' . The literature graph has 12M nodes of this type . Entities . Each node of this type represents a unique scientific concept discussed in the literature , with attributes such as 'canonical name ' , 'aliases ' and 'description ' . Our literature graph has 0.4M nodes of this type . We describe how we populate entity nodes in Β§4.3 . Entity mentions . Each node of this type represents a textual reference of an entity in one of the papers , with attributes such as 'mention text ' , 'context ' , and 'confidence ' . We describe how we populate the 237M mentions in the literature graph in Β§4.1 . Citations . We instantiate a directed citation edge from paper nodes p 1 ! p 2 for each p 2 referenced in p 1 . Citation edges have attributes such as 'from paper id ' , 'to paper id ' and 'contexts ' ( the textual contexts where p 2 is referenced in p 1 ) . While some of the paper sources provide these attributes as metadata , it is often necessary to extract them from the paper PDF as detailed in Β§3 . Authorship . We instantiate a directed authorship edge between an author node and a paper node a ! p for each author of that paper . Entity linking edges . We instantiate a directed edge from an extracted entity mention node to the entity it refers to . Mention-mention relations . We instantiate a directed edge between a pair of mentions in the same sentential context if the textual relation extraction model predicts one of a predefined list of relation types between them in a sentential context . 1 We encode a symmetric relation between m 1 and m 2 as two directed edges m 1 ! m 2 and m 2 ! m 1 . Entity-entity relations . While mentionmention edges represent relations between mentions in a particular context , entity-entity edges represent relations between abstract entities . These relations may be imported from an existing knowledge base ( KB ) or inferred from other edges in the graph . In the previous section , we described the overall structure of the literature graph . Next , we discuss how we populate paper nodes , author nodes , authorship edges , and citation edges . Although some publishers provide sufficient metadata about their papers , many papers are provided with incomplete metadata . Also , papers obtained via web-crawling are not associated with any metadata . To fill in this gap , we built the Sci-enceParse system to predict structured data from the raw PDFs using recurrent neural networks ( RNNs ) . 2 For each paper , the system extracts the paper title , list of authors , and list of references ; each reference consists of a title , a list of authors , a venue , and a year . Preparing the input layer . We split each PDF into individual pages , and feed each page to Apache 's PDFBox library 3 to convert it into a sequence of tokens , where each token has features , e.g. , 'text ' , 'font size ' , 'space width ' , 'position on the page ' . We normalize the token-level features before feeding them as inputs to the model . 
For each of the 'font size ' and 'space width ' features , we compute three normalized values ( with respect to current page , current document , and the whole training corpus ) , each value ranging between -0.5 to +0.5 . The token 's 'position on the page ' is given in XY coordinate points . We scale the values linearly to range from . 0:5 ; 0:5/ at the top-left corner of the page to .0:5 ; 0:5/ at the bottom-right corner . In order to capture case information , we add seven numeric features to the input representation of each token : whether the first/second letter is uppercase/lowercase , the fraction of uppercase/lowercase letters and the fraction of digits . To help the model make correct predictions for metadata which tend to appear at the beginning ( e.g. , titles and authors ) or at the end of papers ( e.g. , references ) , we provide the current page number as two discrete variables ( relative to the beginning and end of the PDF file ) with values 0 , 1 and 2+ . These features are repeated for each token on the same page . For the k-th token in the sequence , we compute the input representation i k by concatenating the numeric features , an embedding of the 'font size ' , and the word embedding of the lowercased token . Word embeddings are initialized with GloVe ( Pennington et al. , 2014 ) . Model . The input token representations are passed through one fully-connected layer and then g ! k D LSTM.Wi k ; g ! k 1 / ; g k D OEg ! k I g k ; h ! k D LSTM.g k ; h ! k 1 / ; h k D OEh ! k I g k where W is a weight matrix , g k and h k are defined similarly to g ! k and h ! k but process token sequences in the opposite direction . Following Collobert et al . 2011 , we feed the output of the second layer h k into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights ( often described as a conditional random field layer when used in neural architectures ) to account for dependencies between labels . Training . The ScienceParse system is trained on a snapshot of the data at PubMed Central . It consists of 1.4M PDFs and their associated metadata , which specify the correct titles , authors , and bibliographies . We use a heuristic labeling process that finds the strings from the metadata in the tokenized PDFs to produce labeled tokens . This labeling process succeeds for 76 % of the documents . The remaining documents are not used in the training process . During training , we only use pages which have at least one token with a label that is not `` none '' . Decoding . At test time , we use Viterbi decoding to find the most likely global sequence , with no further constraints . To get the title , we use the longest continuous sequence of tokens with the `` title '' label . Since there can be multiple authors , we use all continuous sequences of tokens with the `` author '' label as authors , but require that all authors of a paper are mentioned on the same page . If the author labels are predicted in multiple pages , we use the one with the largest number of authors . Results . We run our final tests on a held-out set from PubMed Central , consisting of about 54K documents . The results are detailed in Table 1 . We use a conservative evaluation where an instance is correct if it exactly matches the gold annotation , with no credit for partial matching . To give an example for the type of errors our model makes , consider the paper ( Wang et al. 
, 2013 ) titled `` Clinical review : Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and metaanalysis . '' The title we extract for this paper omits the first part `` Clinical review : '' . This is likely to be a result of the pattern `` Foo : Bar Baz '' appearing in many training examples with only `` Bar Baz '' labeled as the title . In the previous section , we described how we populate the backbone of the literature graph , i.e. , paper nodes , author nodes and citation edges . Next , we discuss how we populate mentions and entities in the literature graph using entity extraction and linking on the paper text . In order to focus on more salient entities in a given paper , we only use the title and abstract . We experiment with three approaches for entity extraction and linking : I . Statistical : uses one or more statistical models for predicting mention spans , then uses another statistical model to link mentions to candidate entities in a KB . II . Hybrid : defines a small number of handengineered , deterministic rules for string-based matching of the input text to candidate entities in the KB , then uses a statistical model to disambiguate the mentions . 4 III . Off-the-shelf : uses existing libraries , namely ( Ferragina and Scaiella , 2010 , TagMe ) 5 and ( Demner-Fushman et al. , 2017 , MetaMap Lite ) 6 , with minimal post-processing to extract and link entities to the KB . Table 2 : Document-level evaluation of three approaches in two scientific areas : computer science ( CS ) and biomedical ( Bio ) . We evaluate the performance of each approach in two broad scientific areas : computer science ( CS ) and biomedical research ( Bio ) . For each unique ( paper ID , entity ID ) pair predicted by one of the approaches , we ask human annotators to label each mention extracted for this entity in the paper . We use CrowdFlower to manage human annotations and only include instances where three or more annotators agree on the label . If one or more of the entity mentions in that paper is judged to be correct , the pair ( paper ID , entity ID ) counts as one correct instance . Otherwise , it counts as an incorrect instance . We report 'yield ' in lieu of 'recall ' due to the difficulty of doing a scalable comprehensive annotation . Table 2 shows the results based on 500 papers using v1.1.2 of our entity extraction and linking components . In both domains , the statistical approach gives the highest precision and the lowest yield . The hybrid approach consistently gives the highest yield , but sacrifices precision . The TagMe off-the-shelf library used for the CS domain gives surprisingly good results , with precision within 1 point from the statistical models . However , the MetaMap Lite off-the-shelf library we used for the biomedical domain suffered a huge loss in precision . Our error analysis showed that each of the approaches is able to predict entities not predicted by the other approaches so we decided to pool their outputs in our deployed system , which gives significantly higher yield than any individual approach while maintaining reasonably high precision . Given the token sequence t 1 ; : : : ; t N in a sentence , we need to identify spans which correspond to entity mentions . We use the BILOU scheme to encode labels at the token level . Unlike most formulations of named entity recognition problems ( NER ) , we do not identify the entity type ( e.g. 
, protein , drug , chemical , disease ) for each mention since the output mentions are further grounded in a KB with further information about the entity ( including its type ) , using an entity linking module . Model . First , we construct the token embedding x k D OEc k I w k for each token t k in the input sequence , where c k is a character-based representation computed using a convolutional neural network ( CNN ) with filter of size 3 characters , and w k are learned word embeddings initialized with the GloVe embeddings ( Pennington et al. , 2014 ) . We also compute context-sensitive word embeddings , denoted as lm k D OElm ! k I lm k , by concatenating the projected outputs of forward and backward recurrent neural network language models ( RNN-LM ) at position k. The language model ( LM ) for each direction is trained independently and consists of a single layer long short-term memory ( LSTM ) network followed by a linear project layer . While training the LM parameters , lm ! k is used to predict t kC1 and lm k is used to predict t k 1 . We fix the LM parameters during training of the entity extraction model . See and for more details . Given the x k and lm k embeddings for each token k 2 f1 ; : : : ; N g , we use a two-layer bidirectional LSTM to encode the sequence with x k and lm k feeding into the first and second layer , respectively . That is , g ! k D LSTM.x k ; g ! k 1 / ; g k D OEg ! k I g k ; h ! k D LSTM.OEg k I lm k ; h ! k 1 / ; h k D OEh ! k I h k ; where g k and h k are defined similarly to g ! k and h ! k but process token sequences in the opposite direction . Similar to the model described in Β§3 , we feed the output of the second LSTM into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights to account for dependencies between labels . Results . We use the standard data splits of the SemEval-2017 Task 10 on entity ( and relation ) extraction from scientific papers ( Augenstein et al. , 2017 ) . Table 3 compares three variants of our entity extraction model . The first line omits the LM embeddings lm k , while the second line is the full model ( including LM embeddings ) showing a large improvement of 4.2 F1 points . The third line shows that creating an ensemble of 15 models further improves the results by 1.1 F1 points . Model instances . In the deployed system , we use three instances of the entity extraction model Description F1 Without LM 49.9 With LM 54.1 Avg . of 15 models with LM 55.2 Table 3 : Results of the entity extraction model on the development set of SemEval-2017 task 10. with a similar architecture , but trained on different datasets . Two instances are trained on the BC5CDR ( Li et al. , 2016 ) and the CHEMDNER datasets ( Krallinger et al. , 2015 ) to extract key entity mentions in the biomedical domain such as diseases , drugs and chemical compounds . The third instance is trained on mention labels induced from Wikipedia articles in the computer science domain . The output of all model instances are pooled together and combined with the rule-based entity extraction module , then fed into the entity linking model ( described below ) . In this section , we describe the construction of entity nodes and entity-entity edges . Unlike other knowledge extraction systems such as the Never-Ending Language Learner ( NELL ) 7 and OpenIE 4 , 8 we use existing knowledge bases ( KBs ) of entities to reduce the burden of identifying coherent concepts . 
Grounding the entity mentions in a manually-curated KB also increases user confidence in automated predictions . We use two KBs : UMLS : The UMLS metathesaurus integrates information about concepts in specialized ontologies in several biomedical domains , and is funded by the U.S. National Library of Medicine . DBpedia : DBpedia provides access to structured information in Wikipedia . Rather than including all Wikipedia pages , we used a short list of Wikipedia categories about CS and included all pages up to depth four in their trees in order to exclude irrelevant entities , e.g. , `` Lord of the Rings '' in DBpedia . Given a text span s identified by the entity extraction model in Β§4.2 ( or with heuristics ) and a reference KB , the goal of the entity linking model is to associate the span with the entity it refers to . A span and its surrounding words are collectively referred to as a mention . We first identify a set of candidate entities that a given mention may refer to . Then , we rank the candidate entities based on a score computed using a neural model trained on labeled data . For example , given the string `` . . . database of facts , an ILP system will . . . `` , the entity extraction model identifies the span `` ILP '' as a possible entity and the entity linking model associates it with `` Inductive_Logic_Programming '' as the referent entity ( from among other candidates like `` Integer_Linear_Programming '' or `` Instruction-level_Parallelism '' ) . Datasets . We used two datasets : i ) a biomedical dataset formed by combining MSH ( Jimeno-Yepes et al. , 2011 ) and BC5CDR ( Li et al. , 2016 ) with UMLS as the reference KB , and ii ) a CS dataset we curated using Wikipedia articles about CS concepts with DBpedia as the reference KB . Candidate selection . In a preprocessing step , we build an index which maps any token used in a labeled mention or an entity name in the KB to associated entity IDs , along with the frequency this token is associated with that entity . This is similar to the index used in previous entity linking systems ( e.g. , Bhagavatula et al. , 2015 ) to estimate the probability that a given mention refers to an entity . At train and test time , we use this index to find candidate entities for a given mention by looking up the tokens in the mention . This method also serves as our baseline in Table 4 by selecting the entity with the highest frequency for a given mention . Scoring candidates . Given a mention ( m ) and a candidate entity ( e ) , the neural model constructs a vector encoding of the mention and the entity . We encode the mention and entity using the functions f and g , respectively , as follows : f.m/ D OEv m.name I avg.v m.lc ; v m.rc / ; g.e/ D OEv e.name I v e.def ; where m.surface , m.lc and m.rc are the mention 's surface form , left and right contexts , and e.name and e.def are the candidate entity 's name and definition , respectively . v text is a bag-of-words sum encoder for text . We use the same encoder for the mention surface form and the candidate name , and another encoder for the mention contexts and entity definition . Additionally , we include numerical features to estimate the confidence of a candidate entity based on the statistics collected in the index described Table 4 : The Bag of Concepts F1 score of the baseline and neural model on the two curated datasets . earlier . 
We compute two scores based on the word overlap of ( i ) mention 's context and candidate 's definition and ( ii ) mention 's surface span and the candidate entity 's name . Finally , we feed the concatenation of the cosine similarity between f.m/ and g.e/ and the intersection-based scores into an affine transformation followed by a sigmoid nonlinearity to compute the final score for the pair ( m , e ) . Results . We use the Bag of Concepts F1 metric ( Ling et al. , 2015 ) for comparison . Table 4 compares the performance of the most-frequent-entity baseline and our neural model described above . In the previous sections , we discussed how we construct the main components of the literature graph . In this section , we briefly describe several other related challenges we are actively working on . Author disambiguation . Despite initiatives to have global author IDs ORCID and ResearcherID , most publishers provide author information as names ( e.g. , arXiv ) . However , author names can not be used as a unique identifier since several people often share the same name . Moreover , different venues and sources use different conventions in reporting the author names , e.g. , `` first initial , last name '' vs. `` last name , first name '' . Inspired by Culotta et al . ( 2007 ) , we train a supervised binary classifier for merging pairs of author instances and use it to incrementally create author clusters . We only consider merging two author instances if they have the same last name and share the first initial . If the first name is spelled out ( rather than abbreviated ) in both author instances , we also require that the first name matches . Ontology matching . Popular concepts are often represented in multiple KBs . For example , the concept of `` artificial neural networks '' is represented as entity ID D016571 in the MESH ontology , and represented as page ID '21523 ' in DBpedia . Ontology matching is the problem of identifying semantically-equivalent entities across KBs or ontologies . 9 Limited KB coverage . The convenience of grounding entities in a hand-curated KB comes at the cost of limited coverage . Introduction of new concepts and relations in the scientific literature occurs at a faster pace than KB curation , resulting in a large gap in KB coverage of scientific concepts . In order to close this gap , we need to develop models which can predict textual relations as well as detailed concept descriptions in scientific papers . For the same reasons , we also need to augment the relations imported from the KB with relations extracted from text . Our approach to address both entity and relation coverage is based on distant supervision ( Mintz et al. , 2009 ) . In short , we train two models for identifying entity definitions and relations expressed in natural language in scientific documents , and automatically generate labeled data for training these models using known definitions and relations in the KB . We note that the literature graph currently lacks coverage for important entity types ( e.g. , affiliations ) and domains ( e.g. , physics ) . Covering affiliations requires small modifications to the metadata extraction model followed by an algorithm for matching author names with their affiliations . In order to cover additional scientific domains , more agreements need to be signed with publishers . Figure and table extraction . 
Non-textual components such as charts , diagrams and tables provide key information in many scientific documents , but the lack of large labeled datasets has impeded the development of data-driven methods for scientific figure extraction . In Siegel et al . ( 2018 ) , we induced high-quality training labels for the task of figure extraction in a large number of scientific documents , with no human intervention . To accomplish this we leveraged the auxiliary data provided in two large web collections of scientific documents ( arXiv and PubMed ) to locate figures and their associated captions in the rasterized PDF . We use the resulting dataset to train a deep neural network for end-to-end figure detection , yielding a model that can be more easily extended to new domains compared to previous work . Understanding and predicting citations . The citation edges in the literature graph provide a wealth of information ( e.g. , at what rate a paper is being cited and whether it is accelerating ) , and opens the door for further research to better understand and predict citations . For example , in order to allow users to better understand what impact a paper had and effectively navigate its citations , we experimented with methods for classifying a citation as important or incidental , as well as more finegrained classes ( Valenzuela et al. , 2015 ) . The citation information also enables us to develop models for estimating the potential of a paper or an author . In Weihs and Etzioni ( 2017 ) , we predict citationbased metrics such as an author 's h-index and the citation rate of a paper in the future . Also related is the problem of predicting which papers should be cited in a given draft ( Bhagavatula et al. , 2018 ) , which can help improve the quality of a paper draft before it is submitted for peer review , or used to supplement the list of references after a paper is published . In this paper , we discuss the construction of a graph , providing a symbolic representation of the scientific literature . We describe deployed models for identifying authors , references and entities in the paper text , and provide experimental results to evaluate the performance of each model . Three research directions follow from this work and other similar projects , e.g. , Hahn-Powell et al . ( 2017 ) ; Wu et al . ( 2014 ) : i ) improving quality and enriching content of the literature graph ( e.g. , ontology matching and knowledge base population ) . ii ) aggregating domain-specific extractions across many papers to enable a better understanding of the literature as a whole ( e.g. , identifying demographic biases in clinical trial participants and summarizing empirical results on important tasks ) . iii ) exploring the literature via natural language interfaces . In order to help future research efforts , we make the following resources publicly available : metadata for over 20 million papers , 10 meaningful citations dataset , 11 models for figure and table extraction , 12 models for predicting citations in a paper draft 13 and models for extracting paper metadata , 14 among other resources . 15 Due to space constraints , we opted not to discuss our relation extraction models in this draft . The ScienceParse libraries can be found at http : // allenai.org/software/.3 https : //pdfbox.apache.org We also experimented with a `` pure '' rules-based approach which disambiguates deterministically but the hybrid approach consistently gave better results.5 The TagMe APIs are described at https : //sobigdata . 
d4science.org/web/tagme/tagme-help6 We use v3.4 ( L0 ) of MetaMap Lite , available at https : //metamap.nlm.nih.gov/MetaMapLite.shtml http : //rtw.ml.cmu.edu/rtw/ 8 https : //github.com/allenai/ openie-standalone Variants of this problem are also known as deduplication or record linkage .
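The candidate-scoring step described in the text above (bag-of-words encodings of the mention and the candidate entity, cosine similarity plus word-overlap features, then an affine transform followed by a sigmoid) can be sketched roughly as below. This is an illustration with hypothetical inputs (an embeddings dict, a weight vector w and bias b), not the deployed Semantic Scholar model.

import numpy as np

def bow_encode(tokens, embeddings, dim=50):
    """Bag-of-words sum of word vectors; out-of-vocabulary tokens are skipped."""
    vecs = [embeddings[t] for t in tokens if t in embeddings]
    return np.sum(vecs, axis=0) if vecs else np.zeros(dim)

def cosine(a, b):
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a @ b / denom) if denom else 0.0

def overlap(a_tokens, b_tokens):
    """Fraction of the first token set that also appears in the second."""
    return len(set(a_tokens) & set(b_tokens)) / max(len(set(a_tokens)), 1)

def score_candidate(mention, left_ctx, right_ctx, ent_name, ent_def,
                    embeddings, w, b, dim=50):
    # f(m): mention surface encoding concatenated with the averaged context encoding
    f_m = np.concatenate([bow_encode(mention, embeddings, dim),
                          (bow_encode(left_ctx, embeddings, dim) +
                           bow_encode(right_ctx, embeddings, dim)) / 2.0])
    # g(e): entity name encoding concatenated with the definition encoding
    g_e = np.concatenate([bow_encode(ent_name, embeddings, dim),
                          bow_encode(ent_def, embeddings, dim)])
    # Cosine similarity plus two overlap-based scores, then affine + sigmoid
    feats = np.array([cosine(f_m, g_e),
                      overlap(mention, ent_name),
                      overlap(left_ctx + right_ctx, ent_def)])
    return 1.0 / (1.0 + np.exp(-(w @ feats + b)))

Here w has one weight per feature (three in this sketch) and b is a scalar bias; in practice these would be learned from labeled mention-entity pairs.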
examples/N18-3011_ref.txt
DELETED
@@ -1,27 +0,0 @@
Waleed Ammar, Matthew E. Peters, Chandra Bhagavatula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).
Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).
Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-based citation recommendation. In NAACL.
Chandra Bhagavatula, Thanapon Noraset, and Doug Downey. 2015. TabEL: entity linking in web tables. In ISWC.
Ronan Collobert, Jason Weston, Léon Bottou, Michael Karlen, Koray Kavukcuoglu, and Pavel P. Kuksa. 2011. Natural language processing (almost) from scratch. In JMLR.
Aron Culotta, Pallika Kanani, Robert Hall, Michael Wick, and Andrew D. McCallum. 2007. Author disambiguation using error-driven machine learning with a ranking loss function. In IIWeb Workshop.
Hal Daumé. 2007. Frustratingly easy domain adaptation. In ACL.
Dina Demner-Fushman, Willie J. Rogers, and Alan R. Aronson. 2017. MetaMap Lite: an evaluation of a new Java implementation of MetaMap. In JAMIA.
Oren Etzioni. 2011. Search needs a shake-up. Nature 476 7358:25-6.
Paolo Ferragina and Ugo Scaiella. 2010. TAGME: on-the-fly annotation of short text fragments (by wikipedia entities). In CIKM.
Gus Hahn-Powell, Marco Antonio Valenzuela-Escarcega, and Mihai Surdeanu. 2017. Swanson linking revisited: Accelerating literature-based discovery across domains using a conceptual influence graph. In ACL.
Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long short-term memory. Neural computation.
Srinivasan Iyer, Ioannis Konstas, Alvin Cheung, Jayant Krishnamurthy, and Luke S. Zettlemoyer. 2017. Learning a neural semantic parser from user feedback. In ACL.
Antonio J. Jimeno-Yepes, Bridget T. McInnes, and Alan R. Aronson. 2011. Exploiting mesh indexing in medline to generate a data set for word sense disambiguation. BMC bioinformatics 12(1):223.
Martin Krallinger, Florian Leitner, Obdulia Rabal, Miguel Vazquez, Julen Oyarzabal, and Alfonso Valencia. 2015. CHEMDNER: The drugs and chemical names extraction challenge. In J. Cheminformatics.
Guillaume Lample, Miguel Ballesteros, Sandeep K Subramanian, Kazuya Kawakami, and Chris Dyer. 2016. Neural architectures for named entity recognition. In HLT-NAACL.
Jiao Li, Yueping Sun, Robin J. Johnson, Daniela Sciaky, Chih-Hsuan Wei, Robert Leaman, Allan Peter Davis, Carolyn J. Mattingly, Thomas C. Wiegers, and Zhiyong Lu. 2016. Biocreative v cdr task corpus: a resource for chemical disease relation extraction. Database: the journal of biological databases and curation 2016.
Xiao Ling, Sameer Singh, and Daniel S. Weld. 2015. Design challenges for entity linking. Transactions of the Association for Computational Linguistics 3:315-328.
Mike Mintz, Steven Bills, Rion Snow, and Daniel Jurafsky. 2009. Distant supervision for relation extraction without labeled data. In ACL.
Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global vectors for word representation. In EMNLP.
Matthew E. Peters, Waleed Ammar, Chandra Bhagavatula, and Russell Power. 2017. Semi-supervised sequence tagging with bidirectional language models. In ACL.
Noah Siegel, Nicholas Lourie, Russell Power, and Waleed Ammar. 2018. Extracting scientific figures with distantly supervised neural networks. In JCDL.
Marco Valenzuela, Vu Ha, and Oren Etzioni. 2015. Identifying meaningful citations. In AAAI Workshop (Scholarly Big Data).
Xiang Wang, Yan Dong, Xiang qian Qi, Yi-Ming Li, Cheng-Guang Huang, and Lijun Hou. 2013. Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage - a systematic review and meta-analysis. In Critical care.
Luca Weihs and Oren Etzioni. 2017. Learning to predict citation-based impact measures. In JCDL.
Jian Wu, Kyle Williams, Hung-Hsuan Chen, Madian Khabsa, Cornelia Caragea, Alexander Ororbia, Douglas Jordan, and C. Lee Giles. 2014. CiteSeerX: AI in a digital library search engine. In AAAI.
Chenyan Xiong, Russell Power, and Jamie Callan. 2017. Explicit semantic ranking for academic search via knowledge graph embedding. In WWW.
reference_string_parsing.py
DELETED
@@ -1,36 +0,0 @@
from typing import List, Tuple

import torch

from SciAssist import ReferenceStringParsing

device = "gpu" if torch.cuda.is_available() else "cpu"
rsp_pipeline = ReferenceStringParsing(os_name="nt", device=device)


def rsp_for_str(input, dehyphen=False) -> List[Tuple[str, str]]:
    results = rsp_pipeline.predict(input, type="str", dehyphen=dehyphen)
    output = []
    for res in results:
        for token, tag in zip(res["tokens"], res["tags"]):
            output.append((token, tag))
        output.append(("\n\n", None))
    return output


def rsp_for_file(input, dehyphen=False) -> List[Tuple[str, str]]:
    if input == None:
        return None
    filename = input.name
    # Identify the format of input and parse reference strings
    if filename[-4:] == ".txt":
        results = rsp_pipeline.predict(filename, type="txt", dehyphen=dehyphen, save_results=False)
    elif filename[-4:] == ".pdf":
        results = rsp_pipeline.predict(filename, dehyphen=dehyphen, save_results=False)
    else:
        return [("File Format Error !", None)]
    # Prepare for the input gradio.HighlightedText accepts.
    output = []
    for res in results:
        for token, tag in zip(res["tokens"], res["tags"]):
            output.append((token, tag))
        output.append(("\n\n", None))
    return output
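A minimal usage sketch for this deleted module, assuming the SciAssist 0.1.4 API it imports; the sample reference string below is illustrative only.

from reference_string_parsing import rsp_for_str

# Parse a raw reference string into (token, tag) pairs for highlighting.
pairs = rsp_for_str(
    "Calzolari, N. (1982) Towards the organization of lexical definitions "
    "on a database structure. In COLING 1982.",
    dehyphen=True,
)
for token, tag in pairs:
    print(token, tag)

Note that importing the module instantiates the ReferenceStringParsing pipeline, which may download model weights on first use.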
requirements.txt
DELETED
@@ -1,6 +0,0 @@
pip==23.2.1
torch==1.12.0
SciAssist==0.1.4
nltk~=3.7
pytest
huggingface-hub==0.27.1
scibert-uncased.pt
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:588d24803c6e9d61bc0fd2afc566f1af0b5a83e71867579e4590793b78316bfc
size 439802669