dsfsi-language-identification-spaces

Runtime error

App Files Files Community

ThapeloAndrewSindane committed on Sep 9, 2024

Commit

edd31b5

verified ·

1 Parent(s): 9da6542

GlotLID and OpenLID

Browse files

Adding GlotLID and OpenLID

Files changed (1) hide show

app.py +75 -48

app.py CHANGED Viewed

@@ -168,6 +168,9 @@ model_afriberta = load_model_pipeline('dsfsi/za-afriberta-lid', "model.bin")
 model_afroxlmr_base = load_model_pipeline('dsfsi/za-afro-xlmr-base-lid', "model.bin")
 model_afrolm        = load_model_pipeline('dsfsi/za-afrolm-lid', "model.bin")
 za_lid = load_model_pipeline('dsfsi/za-lid-bert', "model.bin")
 # @st.cache_resource
 def plot(label, prob):
@@ -250,8 +253,12 @@ def compute(sentences, version = 'v3'):
         model_choice = model_afrolm
     elif version == 'BERT':
         model_choice = za_lid
     else:
-         model_choice = [model_xlmr_large,model_serengeti, model_afriberta, model_afroxlmr_base, model_afrolm, za_lid]
     my_bar = st.progress(0, text=progress_text)
@@ -265,22 +272,70 @@ def compute(sentences, version = 'v3'):
                  all_models_pred = []
                  for model in model_choice:
                             output = model.predict(sent)
-                            output_label = output[index]['label']
-                            output_prob =  output[index]['score']
-                            output_label_language = output[index]['label']
-                            # output_label  = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
-                            # output_prob = max(min(output[1][0], 1), 0)
-                            # output_label_language = output_label.split('_')[0]
-                            # # script control
-                            # if version in ['v3', 'v2', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
-                            #     main_script, all_scripts = get_script(sent)
-                            #     output_label_script = output_label.split('_')[1]
-                            #     if output_label_script not in all_scripts:
-                            #         output_label_script = main_script
-                            #         output_label = f"und_{output_label_script}"
-                            #         output_prob = 0
                             labels = labels + [output_label]
@@ -289,34 +344,6 @@ def compute(sentences, version = 'v3'):
                             my_bar.progress(
                                 min((index) / len(sentences), 1),
                                 text=progress_text,
-                            )
-        else:
-                output = model_choice.predict(sent)
-                output_label = output[index]['label']
-                output_prob =  output[index]['score']
-                output_label_language = output[index]['label']
-                # output_label  = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
-                # output_prob = max(min(output[1][0], 1), 0)
-                # output_label_language = output_label.split('_')[0]
-                # # script control
-                # if version in ['v3', 'v2', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
-                #     main_script, all_scripts = get_script(sent)
-                #     output_label_script = output_label.split('_')[1]
-                #     if output_label_script not in all_scripts:
-                #         output_label_script = main_script
-                #         output_label = f"und_{output_label_script}"
-                #         output_prob = 0
-                labels = labels + [output_label]
-                probs = probs + [output_prob]
-                my_bar.progress(
-                    min((index) / len(sentences), 1),
-                    text=progress_text,
                 )
     my_bar.empty()
     return probs, labels
@@ -343,8 +370,8 @@ with tab1:
     version = st.radio(
         "Choose model",
-        ["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT", "All-Models"],
-        captions=["za-XLMR-Large", "za-Serengeti", "za-AfriBERTa", "za-Afro-XLMR-BASE", "za-AfroLM", "za-BERT", 'All-Models'],
         index = 4,
         key = 'version_tab1',
         horizontal = True
@@ -376,7 +403,7 @@ with tab1:
         # plot
         if version == "All-Models":
-               plot_multiples(["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT"], labels, probs)
         else:
                plot(label, prob)
@@ -385,8 +412,8 @@ with tab2:
     version = st.radio(
         "Choose model",
-        ["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT", "All-Models"],
-        captions=["za-XLMR-Large", "za-Serengeti", "za-AfriBERTa", "za-Afro-XLMR-BASE", "za-AfroLM", "za-BERT", "All-Models"],
         index = 4,
         key = 'version_tab2',
         horizontal = True

 model_afroxlmr_base = load_model_pipeline('dsfsi/za-afro-xlmr-base-lid', "model.bin")
 model_afrolm        = load_model_pipeline('dsfsi/za-afrolm-lid', "model.bin")
 za_lid = load_model_pipeline('dsfsi/za-lid-bert', "model.bin")
+openlid = load_model('laurievb/OpenLID', "model.bin")
+glotlid_3 = load_model(constants.MODEL_NAME, "model_v3.bin")
 # @st.cache_resource
 def plot(label, prob):
         model_choice = model_afrolm
     elif version == 'BERT':
         model_choice = za_lid
+    elif version == 'OpenLID':
+        model_choice = openlid
+    elif version == 'GlotLID v3':
+          model_choice = glotlid_3
     else:
+         model_choice = [model_xlmr_large,model_serengeti, model_afriberta, model_afroxlmr_base, model_afrolm, za_lid, openlid,  glotlid_3]
     my_bar = st.progress(0, text=progress_text)
                  all_models_pred = []
                  for model in model_choice:
                             output = model.predict(sent)
+                            if version in  ["openlid-201", "GlotLID v3"]:
+                                    output_label = output[index]['label']
+                                    output_prob =  output[index]['score']
+                                    output_label_language = output[index]['label']
+                                    labels = labels + [output_label]
+                                    probs = probs + [output_prob]
+                                    my_bar.progress(
+                                        min((index) / len(sentences), 1),
+                                        text=progress_text,
+                                    )
+                            else:
+                                    output_label  = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
+                                    output_prob = max(min(output[1][0], 1), 0)
+                                    output_label_language = output_label.split('_')[0]
+                                    # script control
+                                    if version in ['GlotLID v3', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
+                                        main_script, all_scripts = get_script(sent)
+                                        output_label_script = output_label.split('_')[1]
+                                        if output_label_script not in all_scripts:
+                                            output_label_script = main_script
+                                            output_label = f"und_{output_label_script}"
+                                            output_prob = 0
+                                    labels = labels + [output_label]
+                                    probs = probs + [output_prob]
+                                    my_bar.progress(
+                                        min((index) / len(sentences), 1),
+                                        text=progress_text,
+                                    )
+        else:
+                output = model_choice.predict(sent)
+                if version not in ["openlid-201", "GlotLID v3"]
+                        output_label = output[index]['label']
+                        output_prob =  output[index]['score']
+                        output_label_language = output[index]['label']
+                        labels = labels + [output_label]
+                        probs = probs + [output_prob]
+                        my_bar.progress(
+                                min((index) / len(sentences), 1),
+                                text=progress_text,
+                            )
+                else:
+                            output_label  = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
+                            output_prob = max(min(output[1][0], 1), 0)
+                            output_label_language = output_label.split('_')[0]
+                            # script control
+                            if version in ['GlotLID v3', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
+                                main_script, all_scripts = get_script(sent)
+                                output_label_script = output_label.split('_')[1]
+                                if output_label_script not in all_scripts:
+                                    output_label_script = main_script
+                                    output_label = f"und_{output_label_script}"
+                                    output_prob = 0
                             labels = labels + [output_label]
                             my_bar.progress(
                                 min((index) / len(sentences), 1),
                                 text=progress_text,
                 )
     my_bar.empty()
     return probs, labels
     version = st.radio(
         "Choose model",
+        ["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT", "openlid-201", "GlotLID v3", "All-Models"],
+        captions=["za-XLMR-Large", "za-Serengeti", "za-AfriBERTa", "za-Afro-XLMR-BASE", "za-AfroLM", "za-BERT", "OpenLID", "GlotLID v3",'All-Models'],
         index = 4,
         key = 'version_tab1',
         horizontal = True
         # plot
         if version == "All-Models":
+               plot_multiples(["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT", "OpenLID", "GlotLID v3"], labels, probs)
         else:
                plot(label, prob)
     version = st.radio(
         "Choose model",
+        ["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT","openlid-201", "GlotLID v3", "All-Models"],
+        captions=["za-XLMR-Large", "za-Serengeti", "za-AfriBERTa", "za-Afro-XLMR-BASE", "za-AfroLM", "za-BERT", "OpenLID", "GlotLID v3", "All-Models"],
         index = 4,
         key = 'version_tab2',
         horizontal = True