Spaces:

NbAiLab
/

language-identification

Running

File size: 3,816 Bytes

44d8b8c
cbd7b15
fe1893c
cbd7b15
 
 
fe1893c
8bfc488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe1893c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07568d4
 
fe1893c
 
 
 
 
 
 
 
 
 
 
 
 
 
cbd7b15
 
528f383
 
cbd7b15
8bfc488
 
07568d4
8bfc488
98f3f94
a17d6f7
98f3f94
9d37ac1
 
70dfd01
a17d6f7
98f3f94
 
cbd7b15

from typing import Optional, List, Set, Union, Tuple
from huggingface_hub import hf_hub_download
import gradio as gr
import fasttext

# Fetch "model.bin" from the NbAiLab/nb-nordic-lid repository through
# hf_hub_download, then hand the resulting path to fastText for loading.
_model_path = hf_hub_download("NbAiLab/nb-nordic-lid", "model.bin")
model = fasttext.load_model(_model_path)

# Language codes the model knows: the last three characters of every label.
model_labels = {label[-3:] for label in model.get_labels()}

# Three-letter language code -> English display name shown in the UI.
# "und" is the placeholder used when no language could be determined.
language_dict = {
    "dan": "Danish",
    "eng": "English",
    "fao": "Faroese",
    "fin": "Finnish",
    "isl": "Icelandic",
    "nno": "Norwegian Nynorsk",
    "nob": "Norwegian Bokmål",
    "sma": "Southern Sami",
    "sme": "Northern Sami",
    "smj": "Lule Sami",
    "smn": "Inari Sami",
    "sms": "Skolt Sami",
    "swe": "Swedish",
    "und": "Undetermined",
}

def detect_lang(
    text: str,
    langs: Optional[Union[List, Set]] = None,
    threshold: float = -1.0,
    return_proba: bool = False
) -> Union[str, List[Tuple[str, float]]]:
    """
    Detect the language of ``text`` using the module-level fastText model.

    Texts with fewer than four whitespace-separated words (including empty
    or ``None`` input) are not sent to the model and are reported as
    undetermined (``"und"``). Newlines are replaced with spaces before
    prediction so that multi-line input is classified as a single line
    instead of being rejected by the model.

    Args:
    - text (str): The text to detect the language of.
    - langs (List or Set, optional): Three-letter language codes to restrict
        the result to. When empty or ``None``, every language found in the
        model's labels is allowed.
    - threshold (float, optional): Minimum probability, passed straight to
        ``model.predict``. Defaults to `-1.0`.
    - return_proba (bool, optional): Whether to return all matching
        candidates together with their probabilities. Defaults to `False`.

    Returns:
    str or List[Tuple[str, float]]: With ``return_proba=False``, the language
        code of the first prediction that passes the ``langs`` filter, or
        ``"und"`` when there is none. With ``return_proba=True``, a list of
        ``(language code, probability)`` tuples in the order the model
        returned them, or ``[("und", 1.0)]`` when there is none.
    """
    # predict() works on one line at a time, so fold newlines into spaces.
    # Falsy input (None, "") becomes an empty string and falls through to
    # the "too short" guard below.
    single_line = text.replace("\n", " ") if text else ""

    if len(single_line.split()) < 4:
        return [("und", 1.0)] if return_proba else "und"

    allowed = set(langs) if langs else model_labels

    raw_prediction = model.predict(single_line, threshold=threshold, k=-1)
    predictions = [
        # Labels carry a prefix; the language code is the last three
        # characters. Probabilities are capped at 1.0.
        (label[-3:], min(probability, 1.0))
        for label, probability in zip(*raw_prediction)
        if label[-3:] in allowed
    ]

    if not predictions:
        return [("und", 1.0)] if return_proba else "und"
    return predictions if return_proba else predictions[0][0]


def identify(text, threshold):
    """
    Gradio callback: map language display names to detection probabilities.

    Args:
    - text: The text entered by the user. ``None`` is treated as empty text.
    - threshold: Probability threshold as a percentage; it is divided by 100
        before being passed to ``detect_lang``.

    Returns:
    dict: Display name (from ``language_dict``) -> probability for every
        candidate returned by ``detect_lang``. A language code with no entry
        in ``language_dict`` is shown under its raw code instead of raising
        ``KeyError``.
    """
    # Collapse the input to a single line before detection.
    single_line = (text or "").replace("\n", " ")
    candidates = detect_lang(
        single_line,
        threshold=threshold / 100.0,
        return_proba=True,
    )
    return {language_dict.get(lang, lang): proba for lang, proba in candidates}

# Input widgets: the free-text box and the probability threshold (percent).
text_input = gr.Textbox(label="Text to identify language for")
threshold_input = gr.Slider(0, 100, value=80, step=1, label="Probability threshold (%)")

# Output widget fed by the dict that `identify` returns.
prediction_output = gr.Label(label="Prediction")

# Sample rows offered in the UI; each row is [text, threshold percentage].
example_rows = [
    ["Jeg heter Svein Arne", 80],
    ["Dán lágan li biejadusá dárogiela, rijkalasj unneplågogielaj ja dáro siejvvemgiela birra", 80],
    ["Skriftspråket har derfor helst brukt ord som kan førast attende til gammalnorsk der slike har funnest i levande talemål.", 80],
    ["Ođđadárogiela vuođđun leat leamaš Norgga suopmanat, ja dasto das eai leat nu olu dánskkagiel sánit go girjedárogielas.", 80],
]

# Assemble the demo around `identify` and start it.
iface = gr.Interface(
    title="NB Nordic Language Identification",
    description="""This demo uses the [NB-Nordic-LID](https://huggingface.co/NbAiLab/nb-nordic-lid) model to classify a given text into one of the 12 Nordic languages supported. <b>At least 3 or 4 words are needed to identify the language.</b>""",
    fn=identify,
    inputs=[text_input, threshold_input],
    outputs=prediction_output,
    examples=example_rows,
)
iface.launch()