Preprocessing
Browse files
app.py
CHANGED
@@ -15,91 +15,21 @@ HfFolder.save_token(HF_TOKEN)
|
|
15 |
|
16 |
|
17 |
datasets = {
|
18 |
-
"stars": load_dataset("open-source-metrics/
|
19 |
-
"issues": load_dataset("open-source-metrics/
|
20 |
"pip": load_dataset("open-source-metrics/pip").sort('day'),
|
21 |
}
|
22 |
|
23 |
external_datasets = {
|
24 |
-
"stars": load_dataset("open-source-metrics/stars-external").sort('dates'),
|
25 |
-
"issues": load_dataset("open-source-metrics/issues-external").sort('dates'),
|
26 |
"pip": load_dataset("open-source-metrics/pip-external").sort('day')
|
27 |
}
|
28 |
|
29 |
-
val = 0
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
current_date = datetime.strptime(e['dates'], "%Y-%m-%dT%H:%M:%SZ")
|
38 |
-
first_date = datetime.fromtimestamp(1)
|
39 |
-
week = abs(current_date - first_date).days // 7
|
40 |
-
e['week'] = week
|
41 |
-
|
42 |
-
return e
|
43 |
-
|
44 |
-
|
45 |
-
def _ignore_org_members(e):
|
46 |
-
global val
|
47 |
-
e['range_non_org'] = val
|
48 |
-
|
49 |
-
if e['type']['authorAssociation'] != 'MEMBER':
|
50 |
-
val += 1
|
51 |
-
|
52 |
-
return e
|
53 |
-
|
54 |
-
|
55 |
-
stars = {}
|
56 |
-
for k, v in datasets['stars'].items():
|
57 |
-
stars[k] = v.map(_range)
|
58 |
-
val = 0
|
59 |
-
|
60 |
-
stars_external = {}
|
61 |
-
for k, v in external_datasets['stars'].items():
|
62 |
-
stars_external[k] = v.map(_range)
|
63 |
-
val = 0
|
64 |
-
|
65 |
-
|
66 |
-
issues = {}
|
67 |
-
for k, v in datasets['issues'].items():
|
68 |
-
issues[k] = v.map(_range)
|
69 |
-
val = 0
|
70 |
-
issues[k] = issues[k].map(_ignore_org_members)
|
71 |
-
val = 0
|
72 |
-
|
73 |
-
|
74 |
-
issues_external = {}
|
75 |
-
for k, v in external_datasets['issues'].items():
|
76 |
-
issues_external[k] = v.map(_range)
|
77 |
-
val = 0
|
78 |
-
issues_external[k] = issues_external[k].map(_ignore_org_members)
|
79 |
-
val = 0
|
80 |
-
|
81 |
-
datasets['stars'] = DatasetDict(**stars)
|
82 |
-
datasets['issues'] = DatasetDict(**issues)
|
83 |
-
external_datasets['stars'] = DatasetDict(**stars_external)
|
84 |
-
external_datasets['issues'] = DatasetDict(**issues_external)
|
85 |
-
|
86 |
-
|
87 |
-
def link_values(library_names, returned_values):
|
88 |
-
previous_values = {library_name: None for library_name in library_names}
|
89 |
-
for library_name in library_names:
|
90 |
-
for i in returned_values.keys():
|
91 |
-
if library_name not in returned_values[i]:
|
92 |
-
returned_values[i][library_name] = previous_values[library_name]
|
93 |
-
else:
|
94 |
-
previous_values[library_name] = returned_values[i][library_name]
|
95 |
-
|
96 |
-
return returned_values
|
97 |
-
|
98 |
-
|
99 |
-
def running_mean(x, N, total_length=-1):
|
100 |
-
cumsum = np.cumsum(np.insert(x, 0, 0))
|
101 |
-
to_pad = max(total_length - len(cumsum), 0)
|
102 |
-
return np.pad(cumsum[N:] - cumsum[:-N], (to_pad, 0)) / float(N)
|
103 |
|
104 |
|
105 |
def parse_name_and_options(path):
|
@@ -152,10 +82,12 @@ class RequestHandler(SimpleHTTPRequestHandler):
|
|
152 |
external_dataset_with_most_splits = list(external_dataset_with_most_splits)
|
153 |
external_dataset_with_most_splits.sort()
|
154 |
|
|
|
|
|
155 |
res = {
|
156 |
'internal': dataset_with_most_splits,
|
157 |
'external': external_dataset_with_most_splits,
|
158 |
-
'warnings':
|
159 |
}
|
160 |
|
161 |
print(f"Returning: {res}")
|
@@ -215,90 +147,29 @@ class RequestHandler(SimpleHTTPRequestHandler):
|
|
215 |
return self.response(output)
|
216 |
|
217 |
if self.path.startswith("/retrieveStars"):
|
218 |
-
errors = []
|
219 |
library_names, options = parse_name_and_options(self.path)
|
220 |
-
returned_values = {}
|
221 |
-
dataset_dict = datasets['stars']
|
222 |
-
external_dataset_dict = external_datasets['stars']
|
223 |
week_over_week = '1' in options
|
224 |
|
225 |
-
|
226 |
-
if
|
227 |
-
|
228 |
-
|
229 |
-
dataset = external_dataset_dict[library_name]
|
230 |
-
else:
|
231 |
-
errors.append(f"No {library_name} found in internal or external datasets for stars.")
|
232 |
-
return {'errors': errors}
|
233 |
-
|
234 |
-
last_value = 0
|
235 |
-
last_week = dataset[0]['week']
|
236 |
-
for i in dataset:
|
237 |
-
if week_over_week and last_week == i['week']:
|
238 |
-
continue
|
239 |
-
if i['dates'] in returned_values:
|
240 |
-
returned_values[i['dates']][library_name] = i['range'] - last_value
|
241 |
-
else:
|
242 |
-
returned_values[i['dates']] = {library_name: i['range'] - last_value}
|
243 |
-
|
244 |
-
last_value = i['range'] if week_over_week else 0
|
245 |
-
last_week = i['week']
|
246 |
-
|
247 |
-
returned_values = collections.OrderedDict(sorted(returned_values.items()))
|
248 |
-
returned_values = link_values(library_names, returned_values)
|
249 |
-
output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
|
250 |
-
output['day'] = list(returned_values.keys())[::-1]
|
251 |
-
|
252 |
-
# Trim down to a smaller number of points.
|
253 |
-
output = {k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0] for k, value in output.items()}
|
254 |
-
|
255 |
-
return self.response(output)
|
256 |
-
|
257 |
|
258 |
if self.path.startswith("/retrieveIssues"):
|
259 |
-
errors = []
|
260 |
library_names, options = parse_name_and_options(self.path)
|
261 |
-
|
262 |
exclude_org_members = '1' in options
|
263 |
week_over_week = '2' in options
|
264 |
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
range_id = 'range' if not exclude_org_members else 'range_non_org'
|
269 |
-
|
270 |
-
for library_name in library_names:
|
271 |
-
if library_name in dataset_dict:
|
272 |
-
dataset = dataset_dict[library_name]
|
273 |
-
elif library_name in external_dataset_dict:
|
274 |
-
dataset = external_dataset_dict[library_name]
|
275 |
else:
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
if week_over_week and last_week == i['week']:
|
283 |
-
continue
|
284 |
-
|
285 |
-
if i['dates'] in returned_values:
|
286 |
-
returned_values[i['dates']][library_name] = i[range_id] - last_value
|
287 |
-
else:
|
288 |
-
returned_values[i['dates']] = {library_name: i[range_id] - last_value}
|
289 |
-
|
290 |
-
last_value = i[range_id] if week_over_week else 0
|
291 |
-
last_week = i['week']
|
292 |
-
|
293 |
-
returned_values = collections.OrderedDict(sorted(returned_values.items()))
|
294 |
-
returned_values = link_values(library_names, returned_values)
|
295 |
-
output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
|
296 |
-
output['day'] = list(returned_values.keys())[::-1]
|
297 |
-
|
298 |
-
# Trim down to a smaller number of points.
|
299 |
-
output = {k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0] for k, value in output.items()}
|
300 |
-
|
301 |
-
return self.response(output)
|
302 |
|
303 |
return SimpleHTTPRequestHandler.do_GET(self)
|
304 |
|
|
|
15 |
|
16 |
|
17 |
datasets = {
|
18 |
+
"stars": load_dataset("open-source-metrics/preprocessed_stars"),
|
19 |
+
"issues": load_dataset("open-source-metrics/preprocessed_issues"),
|
20 |
"pip": load_dataset("open-source-metrics/pip").sort('day'),
|
21 |
}
|
22 |
|
23 |
external_datasets = {
|
|
|
|
|
24 |
"pip": load_dataset("open-source-metrics/pip-external").sort('day')
|
25 |
}
|
26 |
|
|
|
27 |
|
28 |
+
def cut_output(full_output: "Dataset", library_names: list) -> dict:
    """Select the requested columns and truncate them to a common length.

    Keeps the ``'day'`` column plus every column whose name is listed in
    ``library_names``. Each kept column is then cut at the largest
    "first ``None``" position found among the library columns, so rows are
    dropped only once every requested library column has reached a ``None``.
    NOTE(review): presumably the ``None`` values are trailing padding in the
    preprocessed split - confirm against the dataset.

    Args:
        full_output: Object exposing ``to_dict()`` that returns a mapping of
            column name to list of values. The annotation is quoted so that
            defining this function does not require ``Dataset`` to be
            importable at module load time.
        library_names: Names of the library columns to keep.

    Returns:
        A dict mapping each kept column name to its truncated list. When no
        requested library column exists, the selected columns (at most
        ``'day'``) are returned untruncated instead of raising ``ValueError``
        from ``max()`` on an empty sequence.
    """
    # Build the membership list once instead of once per column.
    wanted = list(library_names) + ['day']
    columns = {k: v for k, v in full_output.to_dict().items() if k in wanted}

    def _first_none(values):
        # Index of the first None, or the column length when there is none.
        return next((i for i, value in enumerate(values) if value is None), len(values))

    library_columns = [v for k, v in columns.items() if k != 'day']
    if not library_columns:
        # Nothing to align against: hand back what was selected unchanged.
        return columns

    cut = max(_first_none(values) for values in library_columns)
    return {k: v[:cut] for k, v in columns.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
|
35 |
def parse_name_and_options(path):
|
|
|
82 |
external_dataset_with_most_splits = list(external_dataset_with_most_splits)
|
83 |
external_dataset_with_most_splits.sort()
|
84 |
|
85 |
+
warnings.append("Selecting PyTorch and/or TensorFlow will take a while to compute, and may timeout for issues/PRs..")
|
86 |
+
|
87 |
res = {
|
88 |
'internal': dataset_with_most_splits,
|
89 |
'external': external_dataset_with_most_splits,
|
90 |
+
'warnings': []
|
91 |
}
|
92 |
|
93 |
print(f"Returning: {res}")
|
|
|
147 |
return self.response(output)
|
148 |
|
149 |
if self.path.startswith("/retrieveStars"):
|
|
|
150 |
library_names, options = parse_name_and_options(self.path)
|
|
|
|
|
|
|
151 |
week_over_week = '1' in options
|
152 |
|
153 |
+
if week_over_week:
|
154 |
+
return self.response({k: v for k, v in datasets['stars']['wow'].to_dict().items() if k in library_names + ['day']})
|
155 |
+
else:
|
156 |
+
return self.response({k: v for k, v in datasets['stars']['wow'].to_dict().items() if k in library_names + ['day']})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
if self.path.startswith("/retrieveIssues"):
|
|
|
159 |
library_names, options = parse_name_and_options(self.path)
|
|
|
160 |
exclude_org_members = '1' in options
|
161 |
week_over_week = '2' in options
|
162 |
|
163 |
+
if week_over_week:
|
164 |
+
if exclude_org_members:
|
165 |
+
return self.response(cut_output(datasets['issues']['eom_wow'], library_names))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
else:
|
167 |
+
return self.response({k: v for k, v in datasets['issues']['wow'].to_dict().items() if k in library_names + ['day']})
|
168 |
+
else:
|
169 |
+
if exclude_org_members:
|
170 |
+
return self.response({k: v for k, v in datasets['issues']['eom'].to_dict().items() if k in library_names + ['day']})
|
171 |
+
else:
|
172 |
+
return self.response({k: v for k, v in datasets['issues']['raw'].to_dict().items() if k in library_names + ['day']})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
return SimpleHTTPRequestHandler.do_GET(self)
|
175 |
|
index.js
CHANGED
@@ -122,16 +122,16 @@ const initialize = async () => {
|
|
122 |
graphSelector.appendChild(graphSpan);
|
123 |
|
124 |
if (inferJson.warnings.length > 0) {
|
125 |
-
const div = document.createElement('div');
|
126 |
-
div.classList.add('warning-div')
|
127 |
-
|
128 |
for (const warning of inferJson.warnings) {
|
|
|
|
|
|
|
129 |
const labelSpan = document.createElement('span');
|
130 |
labelSpan.textContent = `Warning: ${warning}`;
|
131 |
|
132 |
div.appendChild(labelSpan);
|
|
|
133 |
}
|
134 |
-
warnings.appendChild(div);
|
135 |
}
|
136 |
|
137 |
for (const element of inferJson.internal) {
|
|
|
122 |
graphSelector.appendChild(graphSpan);
|
123 |
|
124 |
if (inferJson.warnings.length > 0) {
|
|
|
|
|
|
|
125 |
for (const warning of inferJson.warnings) {
|
126 |
+
const div = document.createElement('div');
|
127 |
+
div.classList.add('warning-div')
|
128 |
+
|
129 |
const labelSpan = document.createElement('span');
|
130 |
labelSpan.textContent = `Warning: ${warning}`;
|
131 |
|
132 |
div.appendChild(labelSpan);
|
133 |
+
warnings.appendChild(div);
|
134 |
}
|
|
|
135 |
}
|
136 |
|
137 |
for (const element of inferJson.internal) {
|