Prabin Bhandari commited on
Commit
31cd311
·
1 Parent(s): 1ea30cb

Use list of words

Browse files
Files changed (1) hide show
  1. cooccurrence_count.py +43 -44
cooccurrence_count.py CHANGED
@@ -34,17 +34,14 @@ _KWARGS_DESCRIPTION = """
34
  Calculates the co-occurrence of two words in each sentence.
35
  Args:
36
  `data`: a list of `str` which contains a dataset.
37
- `word1`: The first word.
38
- `word2`: The second word.
39
  Returns:
40
- count: The count of total sentences.
41
- co_occurrence_count: The co-occurrence count of word1 and word2 in data.
42
  Examples:
43
  >>> data = ["hello sun","hello moon", "hello sun"]
44
  >>> c_count = evaluate.load("prb977/cooccurrence_count")
45
- >>> results = c_count.compute(data=data, word1='hello', word2='sun')
46
  >>> print(results)
47
- {'count': 3, 'co_occurrence_count': 2}
48
  """
49
 
50
 
@@ -87,41 +84,43 @@ class CooccurrenceCount(evaluate.Measurement):
87
  def _download_and_prepare(self, dl_manager):
88
  stanza.download('en', processors='tokenize')
89
 
90
- def _compute(self, data, word1, word2):
91
- len1 = len(stanza_tokenizer(word1))
92
- len2 = len(stanza_tokenizer(word2))
93
- if len1 > len2:
94
- ugram = len1
95
- lgram = len2
96
- elif len1 < len2:
97
- ugram = len2
98
- lgram = len1
99
- else:
100
- ugram = len1
101
- lgram = len1
102
-
103
- v = CountVectorizer(
104
- ngram_range=(lgram, ugram),
105
- tokenizer=stanza_tokenizer,
106
- lowercase=True
107
- )
108
- analyzer = v.build_analyzer()
109
- vectorizer = CountVectorizer(
110
- ngram_range=(lgram, ugram),
111
- vocabulary={
112
- analyzer(word1)[-1]: 0,
113
- analyzer(word2)[-1]: 1
114
- },
115
- tokenizer=stanza_tokenizer,
116
- lowercase=True
117
- )
118
- co_occurrences = vectorizer.fit_transform(data)
119
- dense_mat = co_occurrences.todense()
120
- count = len(data)
121
- co_occurrence_count = np.sum(
122
- np.apply_along_axis(check_count, axis=1, arr=dense_mat)
123
- )
124
- return {
125
- "cout": count,
126
- "co_occurrence_count": co_occurrence_count,
127
- }
 
 
 
34
  Calculates the co-occurrence of two words in each sentence.
35
  Args:
36
  `data`: a list of `str` which contains a dataset.
37
+ `words`: a list of word pairs (each pair a list of two `str`) whose co-occurrence is counted.
 
38
  Returns:
 
 
39
  Examples:
40
  >>> data = ["hello sun","hello moon", "hello sun"]
41
  >>> c_count = evaluate.load("prb977/cooccurrence_count")
42
+ >>> results = c_count.compute(data=data, words=[['hello','sun']])
43
  >>> print(results)
44
+ [['hello','sun',3,2]]
45
  """
46
 
47
 
 
84
  def _download_and_prepare(self, dl_manager):
85
  stanza.download('en', processors='tokenize')
86
 
87
+ def _compute(self, data, words):
88
+ for each in words:
89
+ word1 = each[0]
90
+ word2 = each[1]
91
+ len1 = len(stanza_tokenizer(word1))
92
+ len2 = len(stanza_tokenizer(word2))
93
+ if len1 > len2:
94
+ ugram = len1
95
+ lgram = len2
96
+ elif len1 < len2:
97
+ ugram = len2
98
+ lgram = len1
99
+ else:
100
+ ugram = len1
101
+ lgram = len1
102
+
103
+ v = CountVectorizer(
104
+ ngram_range=(lgram, ugram),
105
+ tokenizer=stanza_tokenizer,
106
+ lowercase=True
107
+ )
108
+ analyzer = v.build_analyzer()
109
+ vectorizer = CountVectorizer(
110
+ ngram_range=(lgram, ugram),
111
+ vocabulary={
112
+ analyzer(word1)[-1]: 0,
113
+ analyzer(word2)[-1]: 1
114
+ },
115
+ tokenizer=stanza_tokenizer,
116
+ lowercase=True
117
+ )
118
+ co_occurrences = vectorizer.fit_transform(data)
119
+ dense_mat = co_occurrences.todense()
120
+ count = len(data)
121
+ co_occurrence_count = np.sum(
122
+ np.apply_along_axis(check_count, axis=1, arr=dense_mat)
123
+ )
124
+ each.append(count)
125
+ each.append(co_occurrence_count)
126
+ return words