Spaces:
Runtime error
Runtime error
Prabin Bhandari
committed on
Commit
·
1ea30cb
1
Parent(s):
2c03ef4
Some changes
Browse files
- cooccurrence_count.py +26 -4
cooccurrence_count.py
CHANGED
@@ -21,6 +21,7 @@ import evaluate
|
|
21 |
import datasets
|
22 |
from sklearn.feature_extraction.text import CountVectorizer
|
23 |
import numpy as np
|
|
|
24 |
|
25 |
|
26 |
_DESCRIPTION = """\
|
@@ -53,6 +54,18 @@ def check_count(x):
|
|
53 |
return 1
|
54 |
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
@evaluate.utils.file_utils.add_start_docstrings(
|
57 |
_DESCRIPTION,
|
58 |
_KWARGS_DESCRIPTION
|
@@ -71,9 +84,12 @@ class CooccurrenceCount(evaluate.Measurement):
|
|
71 |
}),
|
72 |
)
|
73 |
|
|
|
|
|
|
|
74 |
def _compute(self, data, word1, word2):
|
75 |
-
len1 = len(word1
|
76 |
-
len2 = len(word2
|
77 |
if len1 > len2:
|
78 |
ugram = len1
|
79 |
lgram = len2
|
@@ -84,14 +100,20 @@ class CooccurrenceCount(evaluate.Measurement):
|
|
84 |
ugram = len1
|
85 |
lgram = len1
|
86 |
|
87 |
-
v = CountVectorizer(
|
|
|
|
|
|
|
|
|
88 |
analyzer = v.build_analyzer()
|
89 |
vectorizer = CountVectorizer(
|
90 |
ngram_range=(lgram, ugram),
|
91 |
vocabulary={
|
92 |
analyzer(word1)[-1]: 0,
|
93 |
analyzer(word2)[-1]: 1
|
94 |
-
}
|
|
|
|
|
95 |
)
|
96 |
co_occurrences = vectorizer.fit_transform(data)
|
97 |
dense_mat = co_occurrences.todense()
|
|
|
21 |
import datasets
|
22 |
from sklearn.feature_extraction.text import CountVectorizer
|
23 |
import numpy as np
|
24 |
+
import stanza
|
25 |
|
26 |
|
27 |
_DESCRIPTION = """\
|
|
|
54 |
return 1
|
55 |
|
56 |
|
57 |
+
# Shared stanza pipeline: English, with only the 'tokenize' processor enabled.
# NOTE(review): this is built at import time, whereas stanza.download() is only
# called from _download_and_prepare -- confirm the model is available by then.
nlp = stanza.Pipeline(lang='en', processors='tokenize')


def stanza_tokenizer(sen):
    """Tokenize ``sen`` with the module-level stanza pipeline ``nlp``.

    Args:
        sen: Text to run through ``nlp``.

    Returns:
        A flat list holding the ``text`` of every token, in sentence order.
    """
    parsed = nlp(sen)
    return [
        tok.text
        for sentence in parsed.sentences
        for tok in sentence.tokens
    ]
|
67 |
+
|
68 |
+
|
69 |
@evaluate.utils.file_utils.add_start_docstrings(
|
70 |
_DESCRIPTION,
|
71 |
_KWARGS_DESCRIPTION
|
|
|
84 |
}),
|
85 |
)
|
86 |
|
87 |
+
def _download_and_prepare(self, dl_manager):
    """Download the English stanza resources for the 'tokenize' processor.

    Args:
        dl_manager: Accepted as part of the method signature; not used here.
    """
    stanza.download('en', processors='tokenize')
|
89 |
+
|
90 |
def _compute(self, data, word1, word2):
|
91 |
+
len1 = len(stanza_tokenizer(word1))
|
92 |
+
len2 = len(stanza_tokenizer(word2))
|
93 |
if len1 > len2:
|
94 |
ugram = len1
|
95 |
lgram = len2
|
|
|
100 |
ugram = len1
|
101 |
lgram = len1
|
102 |
|
103 |
+
v = CountVectorizer(
|
104 |
+
ngram_range=(lgram, ugram),
|
105 |
+
tokenizer=stanza_tokenizer,
|
106 |
+
lowercase=True
|
107 |
+
)
|
108 |
analyzer = v.build_analyzer()
|
109 |
vectorizer = CountVectorizer(
|
110 |
ngram_range=(lgram, ugram),
|
111 |
vocabulary={
|
112 |
analyzer(word1)[-1]: 0,
|
113 |
analyzer(word2)[-1]: 1
|
114 |
+
},
|
115 |
+
tokenizer=stanza_tokenizer,
|
116 |
+
lowercase=True
|
117 |
)
|
118 |
co_occurrences = vectorizer.fit_transform(data)
|
119 |
dense_mat = co_occurrences.todense()
|