Mark7549 committed on
Commit
e99824c
·
1 Parent(s): eb0e921

added function to count the occurrences of lemmas per time slice

Browse files
Files changed (2) hide show
  1. app.py +6 -1
  2. word2vec.py +17 -1
app.py CHANGED
@@ -30,16 +30,21 @@ def load_models_for_word_dict():
30
  def load_all_lemmas():
31
  return load_compressed_word_list('all_lemmas.pkl.gz')
32
 
 
 
 
 
33
  # Load compressed word list
34
  all_models_words = load_all_models_words()
35
 
36
-
37
  # Prepare lsj dictionary
38
  lemma_dict = load_lsj_dict()
39
 
40
  # Load dictionary with words as keys and eligible models as values
41
  models_for_word_dict = load_models_for_word_dict()
42
 
 
 
43
 
44
  # Set styles for menu
45
  styles = {
 
30
  def load_all_lemmas():
31
  return load_compressed_word_list('all_lemmas.pkl.gz')
32
 
33
@st.cache_data
def load_lemma_count_dict():
    """
    Return the lemma counts computed from the 'lemma_list_raw' directory.

    Delegates to ``count_lemmas`` and hands back its result unchanged.
    The function is wrapped in ``st.cache_data`` (presumably Streamlit's
    data cache -- confirm against the file's imports), so the directory
    should only need to be scanned when the cache has no stored result.
    """
    lemma_counts_by_file = count_lemmas('lemma_list_raw')
    return lemma_counts_by_file
36
+
37
  # Load compressed word list
38
  all_models_words = load_all_models_words()
39
 
 
40
  # Prepare lsj dictionary
41
  lemma_dict = load_lsj_dict()
42
 
43
  # Load dictionary with words as keys and eligible models as values
44
  models_for_word_dict = load_models_for_word_dict()
45
 
46
+ lemma_counts = load_lemma_count_dict()
47
+
48
 
49
  # Set styles for menu
50
  styles = {
word2vec.py CHANGED
@@ -8,6 +8,7 @@ import xlsxwriter
8
  from sklearn.preprocessing import StandardScaler
9
  from sklearn.manifold import TSNE
10
  import plotly.express as px
 
11
 
12
 
13
 
@@ -457,6 +458,21 @@ def print_3d_model(model_name):
457
  print(f'{word}: {vector}')
458
 
459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
 
461
  def main():
462
  # model = load_word2vec_model('models/archaic_cbow.model')
@@ -481,7 +497,7 @@ def main():
481
  # Iterate over all words and print their vectors
482
  # iterate_over_words(model)
483
 
484
- print_3d_model('archaic')
485
 
486
 
487
  if __name__ == "__main__":
 
8
  from sklearn.preprocessing import StandardScaler
9
  from sklearn.manifold import TSNE
10
  import plotly.express as px
11
+ from collections import Counter
12
 
13
 
14
 
 
458
  print(f'{word}: {vector}')
459
 
460
 
461
def count_lemmas(directory):
    """
    Count token occurrences separately for every text file in a directory.

    Every regular file directly inside ``directory`` whose name ends in
    ``.txt`` is read as UTF-8 and split on whitespace; each resulting token
    is counted. Other entries (different extensions, sub-directories) are
    ignored.

    Args:
        directory: Path of the folder that holds the lemma text files.

    Returns:
        A dict mapping each file name (including the ``.txt`` extension) to
        a ``Counter`` of the tokens found in that file. Files are processed
        in sorted name order, so the key order is reproducible. An empty
        file maps to an empty ``Counter``.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        UnicodeDecodeError: If a text file is not valid UTF-8.
    """
    lemma_count_dict = {}
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for file_name in sorted(os.listdir(directory)):
        path = os.path.join(directory, file_name)
        # Skip other extensions and directories that merely end in ".txt"
        # (open() would fail on the latter).
        if not file_name.endswith(".txt") or not os.path.isfile(path):
            continue
        counts = Counter()
        with open(path, 'r', encoding='utf-8') as f:
            # Stream line by line so a large file is never held in memory
            # as one string; the tokens are the same as splitting it whole.
            for line in f:
                counts.update(line.split())
        lemma_count_dict[file_name] = counts

    return lemma_count_dict
474
+
475
+
476
 
477
  def main():
478
  # model = load_word2vec_model('models/archaic_cbow.model')
 
497
  # Iterate over all words and print their vectors
498
  # iterate_over_words(model)
499
 
500
+ count_lemmas('lemma_list_raw')
501
 
502
 
503
  if __name__ == "__main__":