Prabin Bhandari commited on
Commit
31cd311
·
1 Parent(s): 1ea30cb

Use list of words

Browse files
Files changed (1) hide show
  1. cooccurrence_count.py +43 -44
cooccurrence_count.py CHANGED
@@ -34,17 +34,14 @@ _KWARGS_DESCRIPTION = """
34
  Calculates the co-occurrence of two words in each sentence.
35
  Args:
36
  `data`: a list of `str` which contains a dataset.
37
- `word1`: The first word.
38
- `word2`: The second word.
39
  Returns:
40
- count: The count of total sentences.
41
- co_occurrence_count: The co-occurrence count of word1 and word2 in data.
42
  Examples:
43
  >>> data = ["hello sun","hello moon", "hello sun"]
44
  >>> c_count = evaluate.load("prb977/cooccurrence_count")
45
- >>> results = c_count.compute(data=data, word1='hello', word2='sun')
46
  >>> print(results)
47
- {'count': 3, 'co_occurrence_count': 2}
48
  """
49
 
50
 
@@ -87,41 +84,43 @@ class CooccurrenceCount(evaluate.Measurement):
87
  def _download_and_prepare(self, dl_manager):
88
  stanza.download('en', processors='tokenize')
89
 
90
- def _compute(self, data, word1, word2):
91
- len1 = len(stanza_tokenizer(word1))
92
- len2 = len(stanza_tokenizer(word2))
93
- if len1 > len2:
94
- ugram = len1
95
- lgram = len2
96
- elif len1 < len2:
97
- ugram = len2
98
- lgram = len1
99
- else:
100
- ugram = len1
101
- lgram = len1
102
-
103
- v = CountVectorizer(
104
- ngram_range=(lgram, ugram),
105
- tokenizer=stanza_tokenizer,
106
- lowercase=True
107
- )
108
- analyzer = v.build_analyzer()
109
- vectorizer = CountVectorizer(
110
- ngram_range=(lgram, ugram),
111
- vocabulary={
112
- analyzer(word1)[-1]: 0,
113
- analyzer(word2)[-1]: 1
114
- },
115
- tokenizer=stanza_tokenizer,
116
- lowercase=True
117
- )
118
- co_occurrences = vectorizer.fit_transform(data)
119
- dense_mat = co_occurrences.todense()
120
- count = len(data)
121
- co_occurrence_count = np.sum(
122
- np.apply_along_axis(check_count, axis=1, arr=dense_mat)
123
- )
124
- return {
125
- "cout": count,
126
- "co_occurrence_count": co_occurrence_count,
127
- }
 
 
 
34
  Calculates the co-occurrence of two words in each sentence.
35
  Args:
36
  `data`: a list of `str` which contains a dataset.
37
+ `words`: a list of word pairs (each pair a list of two `str`) whose co-occurrence is counted.
 
38
  Returns:
 
 
39
  Examples:
40
  >>> data = ["hello sun","hello moon", "hello sun"]
41
  >>> c_count = evaluate.load("prb977/cooccurrence_count")
42
+ >>> results = c_count.compute(data=data, words=[['hello','sun']])
43
  >>> print(results)
44
+ [['hello','sun',3,2]]
45
  """
46
 
47
 
 
84
  def _download_and_prepare(self, dl_manager):
85
  stanza.download('en', processors='tokenize')
86
 
87
+ def _compute(self, data, words):
88
+ for each in words:
89
+ word1 = each[0]
90
+ word2 = each[1]
91
+ len1 = len(stanza_tokenizer(word1))
92
+ len2 = len(stanza_tokenizer(word2))
93
+ if len1 > len2:
94
+ ugram = len1
95
+ lgram = len2
96
+ elif len1 < len2:
97
+ ugram = len2
98
+ lgram = len1
99
+ else:
100
+ ugram = len1
101
+ lgram = len1
102
+
103
+ v = CountVectorizer(
104
+ ngram_range=(lgram, ugram),
105
+ tokenizer=stanza_tokenizer,
106
+ lowercase=True
107
+ )
108
+ analyzer = v.build_analyzer()
109
+ vectorizer = CountVectorizer(
110
+ ngram_range=(lgram, ugram),
111
+ vocabulary={
112
+ analyzer(word1)[-1]: 0,
113
+ analyzer(word2)[-1]: 1
114
+ },
115
+ tokenizer=stanza_tokenizer,
116
+ lowercase=True
117
+ )
118
+ co_occurrences = vectorizer.fit_transform(data)
119
+ dense_mat = co_occurrences.todense()
120
+ count = len(data)
121
+ co_occurrence_count = np.sum(
122
+ np.apply_along_axis(check_count, axis=1, arr=dense_mat)
123
+ )
124
+ each.append(count)
125
+ each.append(co_occurrence_count)
126
+ return words