kapllan committed on
Commit
1cdf555
1 Parent(s): 8a95fb3

First commit for migrating the swiss topic modelling space.

Browse files
Files changed (5) hide show
  1. README.md +3 -3
  2. app.py +100 -0
  3. id2label.json +227 -0
  4. install_packages.py +57 -0
  5. requirements.txt +21 -0
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
  title: SwissParlTopicModelling
3
- emoji: 🦀
4
- colorFrom: green
5
  colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.43.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
  title: SwissParlTopicModelling
3
+ emoji: 📉
4
+ colorFrom: indigo
5
  colorTo: red
6
  sdk: gradio
7
+ sdk_version: 4.32.2
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json as js
2
+ import os
3
+ import re
4
+ from typing import List
5
+
6
+ import fasttext
7
+ import gradio as gr
8
+ import joblib
9
+ import omikuji
10
+ from huggingface_hub import snapshot_download
11
+ from install_packages import download_model
12
+
13
# Fetch the fastText language-identification model into the working directory.
download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')

# Download the model files from Hugging Face
for repo_id in ['kapllan/omikuji-bonsai-parliament-de-spacy', 'kapllan/omikuji-bonsai-parliament-fr-spacy',
                'kapllan/omikuji-bonsai-parliament-it-spacy']:
    # The repo id doubles as the local target directory; exist_ok makes the
    # call safe on restarts without a separate existence check.
    os.makedirs(repo_id, exist_ok=True)
    model_dir = snapshot_download(repo_id=repo_id, local_dir=repo_id)

# Language-identification model used by predict_lang().
lang_model = fasttext.load_model('lid.176.bin')

# The label file contains non-ASCII characters (umlauts), so decode it
# explicitly as UTF-8 instead of relying on the platform default encoding.
with open('./id2label.json', 'r', encoding='utf-8') as f:
    id2label = js.load(f)
26
+
27
+
28
def map_language(language: str) -> str:
    """Return the English display name for a supported language code.

    Args:
        language: A language code such as ``'de'``, ``'it'`` or ``'fr'``.

    Returns:
        ``'German'``, ``'Italian'`` or ``'French'`` for the three supported
        codes; any other value is returned unchanged.
    """
    language_mapping = {'de': 'German',
                        'it': 'Italian',
                        'fr': 'French'}
    # Single lookup with the input itself as the fallback.
    return language_mapping.get(language, language)
36
+
37
+
38
# Loaded (vectorizer, model) pairs keyed by language code, so that the model
# files are read from disk once per process instead of once per request.
_LOADED_MODELS = {}


def find_model(language: str):
    """Return the ``(vectorizer, model)`` pair for a supported language.

    Args:
        language: Language code; only ``'de'``, ``'fr'`` and ``'it'`` have models.

    Returns:
        A ``(vectorizer, model)`` tuple loaded from the language's model
        directory, or ``(None, None)`` when the language is not supported.
        Repeated calls for the same language return the same loaded objects.
    """
    if language not in ('de', 'fr', 'it'):
        return None, None
    if language not in _LOADED_MODELS:
        base_dir = f'./kapllan/omikuji-bonsai-parliament-{language}-spacy'
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; this is only safe while the model repos are trusted.
        vectorizer = joblib.load(f'{base_dir}/vectorizer')
        model = omikuji.Model.load(f'{base_dir}/omikuji-model')
        _LOADED_MODELS[language] = (vectorizer, model)
    return _LOADED_MODELS[language]
46
+
47
+
48
def predict_lang(text: str) -> str:
    """Detect the language of *text* with the fastText language-ID model.

    Args:
        text: The document to classify; may span several lines.

    Returns:
        The top predicted label with its ``__label__`` prefix removed
        (e.g. ``'de'``), or an empty string when the model returns no
        prediction.
    """
    # fastText cannot process line breaks. Replace them with a space rather
    # than deleting them, so words on adjacent lines are not fused together.
    text = re.sub(r'[\r\n]+', ' ', text)
    labels, _scores = lang_model.predict(text, k=1)  # only the single best match
    if not labels:
        # Nothing predicted (e.g. empty input): report "unknown" instead of
        # failing with an IndexError.
        return ''
    return labels[0].replace('__label__', '')
54
+
55
+
56
def predict_topic(text: str) -> [List[str], str]:
    """Predict topic labels for *text* using the model of its detected language.

    Returns:
        A pair ``(results, language)``. ``results`` is a list of
        ``(label, score)`` tuples; it is empty when no model is available for
        the detected language or the vectorized text has no non-zero features.
        ``language`` is the detected language after ``map_language``.
    """
    lang_code = predict_lang(text)
    vectorizer, model = find_model(lang_code)
    language = map_language(lang_code)
    results = []

    # No vectorizer means the detected language has no model.
    if vectorizer is None:
        return results, language

    for row in vectorizer.transform([text]):
        if row.nnz == 0:  # All-zero vector: nothing to feed the model
            continue
        _, columns = row.nonzero()
        feature_values = [(col, row[0, col]) for col in columns]
        predictions = model.predict(feature_values, top_k=1000)
        # Translate numeric subject ids into their human-readable labels.
        results.extend((id2label[str(subj_id)], score) for subj_id, score in predictions)
    return results, language
71
+
72
+
73
def topic_modeling(text: str, threshold: float) -> [List[str], str]:
    """Return the topics of *text* whose score reaches *threshold*.

    Returns:
        A pair ``(topics, language)``: the ``(label, score)`` entries from
        ``predict_topic`` with ``score >= threshold``, and the detected
        language. ``topics`` is empty when nothing was predicted or the
        language is not German, French or Italian.
    """
    topics, language = predict_topic(text)
    supported = language in ['German', 'French', 'Italian']
    if not (topics and supported):
        return [], language
    kept = [topic for topic in topics if topic[1] >= threshold]
    return kept, language
81
+
82
+
83
# Build the Gradio interface. The first column holds the input controls and
# the detected-language field; the second column holds the results table.
with gr.Blocks() as iface:
    gr.Markdown("# Topic Modeling")
    gr.Markdown("Enter a document and get each topic along with its score.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=10, placeholder="Enter a document")
            submit_button = gr.Button("Submit")
            # Minimum score a topic needs to be listed; 0.0 keeps every topic.
            threshold_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Score Threshold", value=0.0)
            # Read-only field filled from the second return value of topic_modeling.
            language_text = gr.Textbox(lines=1, placeholder="Detected language will be shown here...",
                                       interactive=False, label="Detected Language")
        with gr.Column():
            output_data = gr.Dataframe(headers=["Label", "Score"])

    # topic_modeling returns (topics, language), matching the two outputs in order.
    submit_button.click(topic_modeling, inputs=[input_text, threshold_slider], outputs=[output_data, language_text])

# Launch the app
# NOTE(review): share=True asks Gradio for a public share link - confirm this
# is intended for the deployment target.
iface.launch(share=True)
id2label.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0": "AHV",
3
+ "1": "Abfall",
4
+ "2": "Abgabe",
5
+ "3": "Agrarpolitik",
6
+ "4": "Agrarproduktion",
7
+ "5": "Altersfragen",
8
+ "6": "Arbeit",
9
+ "7": "Arbeitslosenversicherung",
10
+ "8": "Arbeitslosigkeit",
11
+ "9": "Arbeitsmarkt",
12
+ "10": "Arbeitsrecht",
13
+ "11": "Armee",
14
+ "12": "Armut/Ungleichheit",
15
+ "13": "Artenvielfalt",
16
+ "14": "Asylpolitik",
17
+ "15": "Auslandschweizer",
18
+ "16": "Ausländerpolitik",
19
+ "17": "Aussenpolitik : Ausland",
20
+ "18": "Aussenpolitik : Schweiz",
21
+ "19": "Aussenwirtschaftspolitik",
22
+ "20": "Bankenkrise",
23
+ "21": "Bauwesen/Immobilien",
24
+ "22": "Behinderung",
25
+ "23": "Berg",
26
+ "24": "Berufliche Vorsorge",
27
+ "25": "Berufsbildung",
28
+ "26": "Beschwerderecht",
29
+ "27": "Beschäftigung und Arbeit",
30
+ "28": "Bewaffnung",
31
+ "29": "Beziehung Schweiz - EU",
32
+ "30": "Bildung",
33
+ "31": "Boden",
34
+ "32": "Bürgerrecht",
35
+ "33": "Datenschutz",
36
+ "34": "Demokratie",
37
+ "35": "Digitalisierung",
38
+ "36": "Diplomatie",
39
+ "37": "Diskriminierung",
40
+ "38": "Elektrizität",
41
+ "39": "Energie",
42
+ "40": "Energiepolitik",
43
+ "41": "Erberecht",
44
+ "42": "Ernährung",
45
+ "43": "Erwerbsersatzordnung",
46
+ "44": "Europapolitik",
47
+ "45": "Europarat",
48
+ "46": "Europarecht",
49
+ "47": "Europäische Union",
50
+ "48": "Europäisches Parlament",
51
+ "49": "Exekutive",
52
+ "50": "Familienfragen",
53
+ "51": "Familienrecht",
54
+ "52": "Familienzulage",
55
+ "53": "Finanzausgleich",
56
+ "54": "Finanzmarkt",
57
+ "55": "Finanzplatz",
58
+ "56": "Finanzrecht",
59
+ "57": "Finanzwesen",
60
+ "58": "Flüchtling",
61
+ "59": "Forschung",
62
+ "60": "Fortpflanzung",
63
+ "61": "Fossile Energie",
64
+ "62": "Föderalismus",
65
+ "63": "Geld- und Währungspolitik",
66
+ "64": "Geldwäscherei",
67
+ "65": "Gentechnologie",
68
+ "66": "Gerichtswesen",
69
+ "67": "Geschichte Ausland",
70
+ "68": "Geschichte Schweiz",
71
+ "69": "Geschlechterfragen",
72
+ "70": "Gesellschaftsfragen",
73
+ "71": "Gesundheit",
74
+ "72": "Gesundheitspolitik",
75
+ "73": "Gewalt",
76
+ "74": "Gewerkschaft",
77
+ "75": "Globalisierung",
78
+ "76": "Grenze",
79
+ "77": "Grundrechte",
80
+ "78": "Güterverkehr",
81
+ "79": "Handel",
82
+ "80": "Heil- und Hilfsmittel",
83
+ "81": "Informatik",
84
+ "82": "Information",
85
+ "83": "Informationswissenschaft",
86
+ "84": "Internationale Politik",
87
+ "85": "Internationales Recht",
88
+ "86": "Internet und soziale Medien",
89
+ "87": "Interventionspolitik",
90
+ "88": "Invalidenversicherung",
91
+ "89": "Jagd und Fischerei",
92
+ "90": "Kapital",
93
+ "91": "Katastrophe",
94
+ "92": "Kernenergie",
95
+ "93": "Kinder- und Jugendfragen",
96
+ "94": "Kinderrechte",
97
+ "95": "Kindes- und Erwachsenenschutzrecht",
98
+ "96": "Klimafragen",
99
+ "97": "Konkursrecht",
100
+ "98": "Konsum",
101
+ "99": "Korruption",
102
+ "100": "Krankenversicherung",
103
+ "101": "Krieg",
104
+ "102": "Krise",
105
+ "103": "Kultur",
106
+ "104": "Landwirtschaft",
107
+ "105": "Luft",
108
+ "106": "Luftfahrt",
109
+ "107": "Lärm",
110
+ "108": "Medien",
111
+ "109": "Medien / Kommunikation",
112
+ "110": "Medienrecht",
113
+ "111": "Medizinalberuf",
114
+ "112": "Menschenrechte",
115
+ "113": "Miet- und Wohnungswesen",
116
+ "114": "Migration",
117
+ "115": "Migrationsbewegung",
118
+ "116": "Mutterschaftsversicherung",
119
+ "117": "Nationalbank",
120
+ "118": "Obligationenrecht",
121
+ "119": "Parlament",
122
+ "120": "Parlament Ausland",
123
+ "121": "Parlament Schweiz",
124
+ "122": "Patient",
125
+ "123": "Personenrecht",
126
+ "124": "Pflege",
127
+ "125": "Post",
128
+ "126": "Presse",
129
+ "127": "Privatversicherung",
130
+ "128": "Produktion",
131
+ "129": "Radio und Fernsehen",
132
+ "130": "Rassismus",
133
+ "131": "Ratsmitglied",
134
+ "132": "Raumplanung",
135
+ "133": "Raumplanung und Wohnungswesen",
136
+ "134": "Recht Allgemein",
137
+ "135": "Rechte und Freiheiten",
138
+ "136": "Rechtswissenschaft",
139
+ "137": "Religionsfragen",
140
+ "138": "Sachenrecht",
141
+ "139": "Sans-Papiers",
142
+ "140": "Schiedsgerichtsbarkeit",
143
+ "141": "Schienenverkehr",
144
+ "142": "Schifffahrt",
145
+ "143": "Schule",
146
+ "144": "Service public",
147
+ "145": "Sicherheitspolitik",
148
+ "146": "Sicherheitspolitik/Friedenspolitik",
149
+ "147": "Soziale Fragen",
150
+ "148": "Sozialer Schutz",
151
+ "149": "Sozialhilfe",
152
+ "150": "Sozialpolitik",
153
+ "151": "Sozialversicherung",
154
+ "152": "Spiel",
155
+ "153": "Spital",
156
+ "154": "Sport",
157
+ "155": "Sprache",
158
+ "156": "Staat",
159
+ "157": "Staatspolitik",
160
+ "158": "Staatssouveränität",
161
+ "159": "Sterben und Tod",
162
+ "160": "Steuer",
163
+ "161": "Steuerhinterziehung",
164
+ "162": "Steuerrecht",
165
+ "163": "Steuerwettbewerb",
166
+ "164": "Stiftung",
167
+ "165": "Strafprozessordnung",
168
+ "166": "Strafrecht",
169
+ "167": "Straftat",
170
+ "168": "Strassenverkehr",
171
+ "169": "Sucht",
172
+ "170": "Telefonie",
173
+ "171": "Terrorismus",
174
+ "172": "Tierschutz",
175
+ "173": "Tierversuch",
176
+ "174": "Tourismus",
177
+ "175": "Umwelt",
178
+ "176": "Umweltpolitik",
179
+ "177": "Umweltschutz",
180
+ "178": "Unfallversicherung",
181
+ "179": "Universität/Hochschule/Fachhochschule",
182
+ "180": "Unternehmen",
183
+ "181": "Urheberrecht",
184
+ "182": "Verfahrensrecht",
185
+ "183": "Verfassung",
186
+ "184": "Vergaberecht",
187
+ "185": "Verkehr",
188
+ "186": "Verkehrspolitik",
189
+ "187": "Vertrag",
190
+ "188": "Verwaltungsrecht",
191
+ "189": "Volksabstimmung",
192
+ "190": "Vorrechte und Immunität",
193
+ "191": "Wahlen",
194
+ "192": "Wald",
195
+ "193": "Wasser",
196
+ "194": "Weiterbildung",
197
+ "195": "Wettbewerb",
198
+ "196": "Wirtschaft",
199
+ "197": "Wirtschaftsleben",
200
+ "198": "Wirtschaftspolitik",
201
+ "199": "Wissenschaft / Forschung",
202
+ "200": "Zivilprozessordnung",
203
+ "201": "Zivilrecht",
204
+ "202": "Zivilschutz und Bevölkerungsschutz/Zivildienst",
205
+ "203": "Zoll",
206
+ "204": "erneuerbare Energie",
207
+ "205": "innere Sicherheit",
208
+ "206": "internationale Beziehungen",
209
+ "207": "internationale Organisation",
210
+ "208": "internationale Politik",
211
+ "209": "internationale Rechtshilfe",
212
+ "210": "internationale Strafjustiz",
213
+ "211": "internationale Zusammenarbeit",
214
+ "212": "internationaler Konflikt",
215
+ "213": "internationales Abkommen",
216
+ "214": "internationales Privatrecht",
217
+ "215": "internationales Recht",
218
+ "216": "internationales humanitäres Recht",
219
+ "217": "kantonales Parlament",
220
+ "218": "politische Partei",
221
+ "219": "politische Rechte",
222
+ "220": "politisches Leben",
223
+ "221": "politisches System",
224
+ "222": "öffentliche Finanzen",
225
+ "223": "öffentliche Verwaltung",
226
+ "224": "öffentlicher Verkehr"
227
+ }
install_packages.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import sys
4
+
5
+ import requests
6
+
7
+
8
def download_model(url, save_path, timeout=60):
    """Download *url* to *save_path*, streaming the response body to disk.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path of the downloaded file.
        timeout: Seconds to wait for the server to connect/send data before
            ``requests`` gives up (passed straight to ``requests.get``).

    A non-200 response is reported on stdout and leaves *save_path* untouched.
    The function returns ``None`` in either case; exceptions raised by
    ``requests`` or by file I/O propagate to the caller.
    """
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check if the request was successful (status code 200)
        if response.status_code != 200:
            print(f"Failed to download model. Status code: {response.status_code}")
            return

        # Write to a temporary sibling file and rename it at the end, so an
        # interrupted download never leaves a truncated file at save_path.
        partial_path = f"{save_path}.part"
        try:
            with open(partial_path, 'wb') as f:
                # 1 MiB chunks keep the number of writes small for large models.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
            os.replace(partial_path, save_path)
        finally:
            # Only present here if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print("Model downloaded successfully!")
23
+
24
+
25
def set_tokenizers_parallelism(value):
    """Set TOKENIZERS_PARALLELISM to 'true' or 'false' based on the truthiness of *value*.

    The variable is written to this process's environment and the resulting
    value is echoed on stdout.
    """
    if value:
        flag = 'true'
    else:
        flag = 'false'
    os.environ['TOKENIZERS_PARALLELISM'] = flag
    print(f"TOKENIZERS_PARALLELISM set to {os.environ['TOKENIZERS_PARALLELISM']}")
29
+
30
+
31
def install_requirements():
    """Run ``pip install -r requirements.txt`` with the current interpreter.

    Prints a confirmation on success; on a non-zero pip exit status it prints
    the error and terminates the process with exit code 1.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as e:
        print(f"Failed to install packages from requirements.txt: {e}")
        sys.exit(1)
    else:
        print("All packages from requirements.txt installed successfully.")
39
+
40
+
41
def install_spacy_model(model_name):
    """Download the spaCy model *model_name* via ``python -m spacy download``.

    Prints a confirmation on success; on a non-zero exit status it prints the
    error and terminates the process with exit code 1.
    """
    spacy_command = [sys.executable, "-m", "spacy", "download", model_name]
    try:
        subprocess.check_call(spacy_command)
    except subprocess.CalledProcessError as e:
        print(f"Failed to install spaCy model '{model_name}': {e}")
        sys.exit(1)
    else:
        print(f"spaCy model '{model_name}' installed successfully.")
49
+
50
+
51
if __name__ == "__main__":
    # One-off environment setup: Python packages first, then the spaCy
    # pipelines for the three languages, then the fastText language-ID model.
    install_requirements()
    for spacy_model in ("de_core_news_lg", "fr_core_news_lg", "it_core_news_lg"):
        install_spacy_model(spacy_model)
    set_tokenizers_parallelism(True)
    download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Requests==2.32.2
2
+ annif
3
+ beautifulsoup4==4.12.3
4
+ datasets==2.14.5
5
+ fasttext==0.9.2
6
+ gradio
7
+ iterative_stratification==0.1.7
8
+ nltk==3.8.1
9
+ numpy==1.24.4
10
+ omikuji==0.5.1
11
+ openpyxl
12
+ pandas==2.2.2
13
+ pytz==2023.3.post1
14
+ scikit_learn==1.3.2
15
+ sentence_transformers==2.2.2
16
+ swissparlpy==0.3.0
17
+ tqdm==4.66.1
18
+ transformers==4.39.3
19
+ spacy==3.7.4
20
+ huggingface_hub
21
+ requests