added function to count the occurrences of lemmas per time slice
- app.py +6 -1
- word2vec.py +17 -1
app.py
CHANGED
@@ -30,16 +30,21 @@ def load_models_for_word_dict():
 def load_all_lemmas():
     return load_compressed_word_list('all_lemmas.pkl.gz')
 
+@st.cache_data
+def load_lemma_count_dict():
+    return count_lemmas('lemma_list_raw')
+
 # Load compressed word list
 all_models_words = load_all_models_words()
 
-
 # Prepare lsj dictionary
 lemma_dict = load_lsj_dict()
 
 # Load dictionary with words as keys and eligible models as values
 models_for_word_dict = load_models_for_word_dict()
 
+lemma_counts = load_lemma_count_dict()
+
 
 # Set styles for menu
 styles = {
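For context, a minimal sketch of how the cached lemma_counts dictionary loaded above could be queried from the app. The time-slice file name and the lemma below are placeholders for illustration, not values taken from this commit:

    # Hypothetical lookup in lemma_counts: keys are the per-time-slice .txt
    # file names, values are collections.Counter objects of lemma frequencies.
    slice_file = 'archaic.txt'   # placeholder time-slice file name (assumption)
    lemma = 'λόγος'              # placeholder lemma (assumption)
    occurrences = lemma_counts.get(slice_file, {}).get(lemma, 0)
    print(f'{lemma} occurs {occurrences} times in {slice_file}')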
word2vec.py
CHANGED
@@ -8,6 +8,7 @@ import xlsxwriter
 from sklearn.preprocessing import StandardScaler
 from sklearn.manifold import TSNE
 import plotly.express as px
+from collections import Counter
 
 
 
@@ -457,6 +458,21 @@ def print_3d_model(model_name):
         print(f'{word}: {vector}')
 
 
+def count_lemmas(directory):
+    """
+    Create a Counter with all words and their occurrences for all models
+    """
+    lemma_count_dict = {}
+    for file in os.listdir(directory):
+        if file.endswith(".txt"):
+            with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
+                text = f.read()
+                words = text.split()
+                lemma_count_dict[file] = Counter(words)
+
+    return lemma_count_dict
+
+
 
 def main():
     # model = load_word2vec_model('models/archaic_cbow.model')
@@ -481,7 +497,7 @@ def main():
     # Iterate over all words and print their vectors
     # iterate_over_words(model)
 
-
+    count_lemmas('lemma_list_raw')
 
 
 if __name__ == "__main__":
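For context, a minimal sketch of calling count_lemmas on its own, assuming lemma_list_raw contains one plain-text lemma file per time slice (the same directory name main() passes in):

    # Hypothetical standalone run of count_lemmas.
    counts_per_slice = count_lemmas('lemma_list_raw')
    for slice_file, counter in counts_per_slice.items():
        # Each value is a collections.Counter, so most_common(3) returns
        # the three most frequent lemmas in that time slice.
        print(slice_file, counter.most_common(3))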