Spaces:

rcds
/

SwissParlTopicModelling

Running

App Files Files Community

kapllan committed on Sep 10, 2024

Commit

1cdf555

1 Parent(s): 8a95fb3

First commit for migrating the Swiss topic modelling space.

Browse files

Files changed (5) hide show

README.md +3 -3
app.py +100 -0
id2label.json +227 -0
install_packages.py +57 -0
requirements.txt +21 -0

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
 title: SwissParlTopicModelling
-emoji: 🦀
-colorFrom: green
 colorTo: red
 sdk: gradio
-sdk_version: 4.43.0
 app_file: app.py
 pinned: false
 ---

 ---
 title: SwissParlTopicModelling
+emoji: 📉
+colorFrom: indigo
 colorTo: red
 sdk: gradio
+sdk_version: 4.32.2
 app_file: app.py
 pinned: false
 ---

app.py ADDED Viewed

	@@ -0,0 +1,100 @@

import functools
import json as js
import os
import re
from typing import List, Tuple

import fasttext
import gradio as gr
import joblib
import omikuji
from huggingface_hub import snapshot_download

from install_packages import download_model
# Fetch the fastText language-identification model that predict_lang() uses.
download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')
# Download the model files from Hugging Face into local directories named
# after the repository ids; find_model() loads them from these paths.
for repo_id in ['kapllan/omikuji-bonsai-parliament-de-spacy', 'kapllan/omikuji-bonsai-parliament-fr-spacy',
                'kapllan/omikuji-bonsai-parliament-it-spacy']:
    # exist_ok replaces the separate existence check.
    os.makedirs(repo_id, exist_ok=True)
    snapshot_download(repo_id=repo_id, local_dir=repo_id)
lang_model = fasttext.load_model('lid.176.bin')
# The label file contains non-ASCII characters (e.g. umlauts), so decode it
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open('./id2label.json', 'r', encoding='utf-8') as f:
    id2label = js.load(f)
def map_language(language: str) -> str:
    """Return the English display name for a language code.

    The codes 'de', 'it' and 'fr' map to 'German', 'Italian' and 'French'.
    Any other value is returned unchanged.
    """
    language_mapping = {'de': 'German',
                        'it': 'Italian',
                        'fr': 'French'}
    # dict.get with the input as fallback replaces the membership test.
    return language_mapping.get(language, language)
@functools.lru_cache(maxsize=None)
def _load_model(language: str):
    """Load the vectorizer and Omikuji model for one supported language.

    Cached so the files are read from disk once per language instead of on
    every request. find_model() only calls this for 'de', 'fr' and 'it', so
    the cache holds at most three entries.
    """
    model_dir = f'./kapllan/omikuji-bonsai-parliament-{language}-spacy'
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code; only load vectorizers from repositories you trust.
    vectorizer = joblib.load(f'{model_dir}/vectorizer')
    model = omikuji.Model.load(f'{model_dir}/omikuji-model')
    return vectorizer, model


def find_model(language: str):
    """Return the ``(vectorizer, model)`` pair for a language code.

    :param language: language code; 'de', 'fr' and 'it' are supported.
    :return: the loaded vectorizer and Omikuji model, or ``(None, None)``
        for any other language.
    """
    if language not in ('de', 'fr', 'it'):
        return None, None
    return _load_model(language)
def predict_lang(text: str) -> str:
    """Detect the language of *text* with the fastText language model.

    :param text: the document to classify; it may span several lines.
    :return: the best-matching label with its ``__label__`` prefix removed
        (e.g. 'de'), or an empty string if the model returns no label.
    """
    # fastText cannot process text containing line breaks. Replace each run
    # of them with a space rather than deleting it, so the last word of one
    # line is not fused with the first word of the next.
    text = re.sub(r'[\r\n]+', ' ', text)
    labels, _scores = lang_model.predict(text, k=1)  # k=1: best match only
    if len(labels) == 0:
        return ''
    return re.sub(r'__label__', '', labels[0])
def predict_topic(text: str) -> Tuple[List[Tuple[str, float]], str]:
    """Predict topic labels for *text* in its detected language.

    :param text: the document to classify.
    :return: ``(results, language)``. ``results`` is a list of
        ``(label, score)`` pairs, with labels looked up in ``id2label`` and
        scores as returned by the model. It is empty when no model exists
        for the detected language or the text vectorizes to all zeros.
        ``language`` is the detected language after map_language().
    """
    language = predict_lang(text)
    vectorizer, model = find_model(language)
    language = map_language(language)
    results = []
    if vectorizer is None:  # no model for this language
        return results, language
    for row in vectorizer.transform([text]):
        if row.nnz == 0:  # All zero vector, empty result
            continue
        # The model takes (feature index, value) pairs for the non-zero
        # entries of the row.
        feature_values = [(col, row[0, col]) for col in row.nonzero()[1]]
        results.extend((id2label[str(subj_id)], score)
                       for subj_id, score in model.predict(feature_values, top_k=1000))
    return results, language
def topic_modeling(text: str, threshold: float) -> Tuple[List[Tuple[str, float]], str]:
    """Return the topics of *text* whose score reaches *threshold*.

    :param text: the document to classify.
    :param threshold: minimum score (inclusive) a topic needs to be kept.
    :return: ``(topics, language)``. ``topics`` is a list of
        ``(label, score)`` pairs, empty when nothing was predicted or the
        language is not German, French or Italian. ``language`` is the
        detected language as returned by predict_topic().
    """
    topics, language = predict_topic(text)
    if not topics or language not in ('German', 'French', 'Italian'):
        return [], language
    return [topic for topic in topics if topic[1] >= threshold], language
# Build the UI: the document input, threshold slider and detected-language
# field on the left, the table of (label, score) results on the right.
with gr.Blocks() as iface:
    gr.Markdown("# Topic Modeling")
    gr.Markdown("Enter a document and get each topic along with its score.")
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=10, placeholder="Enter a document")
            submit_button = gr.Button("Submit")
            threshold_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Score Threshold", value=0.0)
            language_text = gr.Textbox(lines=1, placeholder="Detected language will be shown here...",
                                       interactive=False, label="Detected Language")
        with gr.Column():
            output_data = gr.Dataframe(headers=["Label", "Score"])
    # topic_modeling returns (topics, language), matching the two outputs.
    submit_button.click(topic_modeling, inputs=[input_text, threshold_slider], outputs=[output_data, language_text])
# Launch the app. Share links are not supported on Hugging Face Spaces, which
# set the SPACE_ID environment variable, so only request one elsewhere.
iface.launch(share=os.environ.get('SPACE_ID') is None)

id2label.json ADDED Viewed

	@@ -0,0 +1,227 @@

+{
+  "0": "AHV",
+  "1": "Abfall",
+  "2": "Abgabe",
+  "3": "Agrarpolitik",
+  "4": "Agrarproduktion",
+  "5": "Altersfragen",
+  "6": "Arbeit",
+  "7": "Arbeitslosenversicherung",
+  "8": "Arbeitslosigkeit",
+  "9": "Arbeitsmarkt",
+  "10": "Arbeitsrecht",
+  "11": "Armee",
+  "12": "Armut/Ungleichheit",
+  "13": "Artenvielfalt",
+  "14": "Asylpolitik",
+  "15": "Auslandschweizer",
+  "16": "Ausländerpolitik",
+  "17": "Aussenpolitik : Ausland",
+  "18": "Aussenpolitik : Schweiz",
+  "19": "Aussenwirtschaftspolitik",
+  "20": "Bankenkrise",
+  "21": "Bauwesen/Immobilien",
+  "22": "Behinderung",
+  "23": "Berg",
+  "24": "Berufliche Vorsorge",
+  "25": "Berufsbildung",
+  "26": "Beschwerderecht",
+  "27": "Beschäftigung und Arbeit",
+  "28": "Bewaffnung",
+  "29": "Beziehung Schweiz - EU",
+  "30": "Bildung",
+  "31": "Boden",
+  "32": "Bürgerrecht",
+  "33": "Datenschutz",
+  "34": "Demokratie",
+  "35": "Digitalisierung",
+  "36": "Diplomatie",
+  "37": "Diskriminierung",
+  "38": "Elektrizität",
+  "39": "Energie",
+  "40": "Energiepolitik",
+  "41": "Erberecht",
+  "42": "Ernährung",
+  "43": "Erwerbsersatzordnung",
+  "44": "Europapolitik",
+  "45": "Europarat",
+  "46": "Europarecht",
+  "47": "Europäische Union",
+  "48": "Europäisches Parlament",
+  "49": "Exekutive",
+  "50": "Familienfragen",
+  "51": "Familienrecht",
+  "52": "Familienzulage",
+  "53": "Finanzausgleich",
+  "54": "Finanzmarkt",
+  "55": "Finanzplatz",
+  "56": "Finanzrecht",
+  "57": "Finanzwesen",
+  "58": "Flüchtling",
+  "59": "Forschung",
+  "60": "Fortpflanzung",
+  "61": "Fossile Energie",
+  "62": "Föderalismus",
+  "63": "Geld- und Währungspolitik",
+  "64": "Geldwäscherei",
+  "65": "Gentechnologie",
+  "66": "Gerichtswesen",
+  "67": "Geschichte Ausland",
+  "68": "Geschichte Schweiz",
+  "69": "Geschlechterfragen",
+  "70": "Gesellschaftsfragen",
+  "71": "Gesundheit",
+  "72": "Gesundheitspolitik",
+  "73": "Gewalt",
+  "74": "Gewerkschaft",
+  "75": "Globalisierung",
+  "76": "Grenze",
+  "77": "Grundrechte",
+  "78": "Güterverkehr",
+  "79": "Handel",
+  "80": "Heil- und Hilfsmittel",
+  "81": "Informatik",
+  "82": "Information",
+  "83": "Informationswissenschaft",
+  "84": "Internationale Politik",
+  "85": "Internationales Recht",
+  "86": "Internet und soziale Medien",
+  "87": "Interventionspolitik",
+  "88": "Invalidenversicherung",
+  "89": "Jagd und Fischerei",
+  "90": "Kapital",
+  "91": "Katastrophe",
+  "92": "Kernenergie",
+  "93": "Kinder- und Jugendfragen",
+  "94": "Kinderrechte",
+  "95": "Kindes- und Erwachsenenschutzrecht",
+  "96": "Klimafragen",
+  "97": "Konkursrecht",
+  "98": "Konsum",
+  "99": "Korruption",
+  "100": "Krankenversicherung",
+  "101": "Krieg",
+  "102": "Krise",
+  "103": "Kultur",
+  "104": "Landwirtschaft",
+  "105": "Luft",
+  "106": "Luftfahrt",
+  "107": "Lärm",
+  "108": "Medien",
+  "109": "Medien / Kommunikation",
+  "110": "Medienrecht",
+  "111": "Medizinalberuf",
+  "112": "Menschenrechte",
+  "113": "Miet- und Wohnungswesen",
+  "114": "Migration",
+  "115": "Migrationsbewegung",
+  "116": "Mutterschaftsversicherung",
+  "117": "Nationalbank",
+  "118": "Obligationenrecht",
+  "119": "Parlament",
+  "120": "Parlament Ausland",
+  "121": "Parlament Schweiz",
+  "122": "Patient",
+  "123": "Personenrecht",
+  "124": "Pflege",
+  "125": "Post",
+  "126": "Presse",
+  "127": "Privatversicherung",
+  "128": "Produktion",
+  "129": "Radio und Fernsehen",
+  "130": "Rassismus",
+  "131": "Ratsmitglied",
+  "132": "Raumplanung",
+  "133": "Raumplanung und Wohnungswesen",
+  "134": "Recht Allgemein",
+  "135": "Rechte und Freiheiten",
+  "136": "Rechtswissenschaft",
+  "137": "Religionsfragen",
+  "138": "Sachenrecht",
+  "139": "Sans-Papiers",
+  "140": "Schiedsgerichtsbarkeit",
+  "141": "Schienenverkehr",
+  "142": "Schifffahrt",
+  "143": "Schule",
+  "144": "Service public",
+  "145": "Sicherheitspolitik",
+  "146": "Sicherheitspolitik/Friedenspolitik",
+  "147": "Soziale Fragen",
+  "148": "Sozialer Schutz",
+  "149": "Sozialhilfe",
+  "150": "Sozialpolitik",
+  "151": "Sozialversicherung",
+  "152": "Spiel",
+  "153": "Spital",
+  "154": "Sport",
+  "155": "Sprache",
+  "156": "Staat",
+  "157": "Staatspolitik",
+  "158": "Staatssouveränität",
+  "159": "Sterben und Tod",
+  "160": "Steuer",
+  "161": "Steuerhinterziehung",
+  "162": "Steuerrecht",
+  "163": "Steuerwettbewerb",
+  "164": "Stiftung",
+  "165": "Strafprozessordnung",
+  "166": "Strafrecht",
+  "167": "Straftat",
+  "168": "Strassenverkehr",
+  "169": "Sucht",
+  "170": "Telefonie",
+  "171": "Terrorismus",
+  "172": "Tierschutz",
+  "173": "Tierversuch",
+  "174": "Tourismus",
+  "175": "Umwelt",
+  "176": "Umweltpolitik",
+  "177": "Umweltschutz",
+  "178": "Unfallversicherung",
+  "179": "Universität/Hochschule/Fachhochschule",
+  "180": "Unternehmen",
+  "181": "Urheberrecht",
+  "182": "Verfahrensrecht",
+  "183": "Verfassung",
+  "184": "Vergaberecht",
+  "185": "Verkehr",
+  "186": "Verkehrspolitik",
+  "187": "Vertrag",
+  "188": "Verwaltungsrecht",
+  "189": "Volksabstimmung",
+  "190": "Vorrechte und Immunität",
+  "191": "Wahlen",
+  "192": "Wald",
+  "193": "Wasser",
+  "194": "Weiterbildung",
+  "195": "Wettbewerb",
+  "196": "Wirtschaft",
+  "197": "Wirtschaftsleben",
+  "198": "Wirtschaftspolitik",
+  "199": "Wissenschaft / Forschung",
+  "200": "Zivilprozessordnung",
+  "201": "Zivilrecht",
+  "202": "Zivilschutz und Bevölkerungsschutz/Zivildienst",
+  "203": "Zoll",
+  "204": "erneuerbare Energie",
+  "205": "innere Sicherheit",
+  "206": "internationale Beziehungen",
+  "207": "internationale Organisation",
+  "208": "internationale Politik",
+  "209": "internationale Rechtshilfe",
+  "210": "internationale Strafjustiz",
+  "211": "internationale Zusammenarbeit",
+  "212": "internationaler Konflikt",
+  "213": "internationales Abkommen",
+  "214": "internationales Privatrecht",
+  "215": "internationales Recht",
+  "216": "internationales humanitäres Recht",
+  "217": "kantonales Parlament",
+  "218": "politische Partei",
+  "219": "politische Rechte",
+  "220": "politisches Leben",
+  "221": "politisches System",
+  "222": "öffentliche Finanzen",
+  "223": "öffentliche Verwaltung",
+  "224": "öffentlicher Verkehr"
+}

install_packages.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import os
+import subprocess
+import sys
+import requests
def download_model(url, save_path, chunk_size=1024 * 1024, timeout=60):
    """Download the file at *url* and store it at *save_path*.

    The body is streamed to ``<save_path>.part`` and moved into place only
    after the transfer completes, so an interrupted download never leaves a
    truncated file at *save_path*.

    :param url: address of the file to download.
    :param save_path: destination path of the downloaded file.
    :param chunk_size: number of bytes to read per streamed chunk.
    :param timeout: seconds to wait for the server before giving up.
    :return: True if the file was downloaded, False if the server answered
        with a status code other than 200 (*save_path* is left untouched).
    :raises requests.RequestException: on connection errors or timeouts.
    """
    partial_path = f'{save_path}.part'
    # The context manager releases the streamed connection when done.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download model. Status code: {response.status_code}")
            return False
        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        except Exception:
            # Drop the incomplete file before propagating the error.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
    os.replace(partial_path, save_path)
    print("Model downloaded successfully!")
    return True
def set_tokenizers_parallelism(value):
    """Export TOKENIZERS_PARALLELISM as 'true' or 'false'.

    A truthy *value* stores 'true' and a falsy one stores 'false'. The
    resulting setting is printed.
    """
    flag = 'false'
    if value:
        flag = 'true'
    os.environ['TOKENIZERS_PARALLELISM'] = flag
    print(f"TOKENIZERS_PARALLELISM set to {os.environ['TOKENIZERS_PARALLELISM']}")
def install_requirements():
    """Run pip for requirements.txt with the current interpreter.

    Prints a confirmation on success. If pip exits with a non-zero status,
    the error is printed and the process exits with status 1.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as e:
        print(f"Failed to install packages from requirements.txt: {e}")
        sys.exit(1)
    else:
        print("All packages from requirements.txt installed successfully.")
def install_spacy_model(model_name):
    """Download the spaCy model *model_name* with the current interpreter.

    Prints a confirmation on success. If the download command exits with a
    non-zero status, the error is printed and the process exits with
    status 1.
    """
    download_command = [sys.executable, "-m", "spacy", "download", model_name]
    try:
        subprocess.check_call(download_command)
    except subprocess.CalledProcessError as e:
        print(f"Failed to install spaCy model '{model_name}': {e}")
        sys.exit(1)
    else:
        print(f"spaCy model '{model_name}' installed successfully.")
if __name__ == "__main__":
    # Set up the environment in order: Python packages, the three spaCy
    # models, the tokenizer setting, then the fastText language model.
    install_requirements()
    for spacy_model in ("de_core_news_lg", "fr_core_news_lg", "it_core_news_lg"):
        install_spacy_model(spacy_model)
    # NOTE(review): this only changes the environment of this process and
    # its children - confirm whether the app itself needs the variable.
    set_tokenizers_parallelism(True)
    download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+Requests==2.32.2
+annif
+beautifulsoup4==4.12.3
+datasets==2.14.5
+fasttext==0.9.2
+gradio
+iterative_stratification==0.1.7
+nltk==3.8.1
+numpy==1.24.4
+omikuji==0.5.1
+openpyxl
+pandas==2.2.2
+pytz==2023.3.post1
+scikit_learn==1.3.2
+sentence_transformers==2.2.2
+swissparlpy==0.3.0
+tqdm==4.66.1
+transformers==4.39.3
+spacy==3.7.4
+huggingface_hub
+requests