<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width" />
  <title>xet-repo-dedupe</title>
  <link rel="stylesheet" href="style.css" />
  <!--
    The Vega libraries are loaded synchronously (no defer/async) on purpose:
    the inline script in <body> calls vegaEmbed() as soon as it is parsed,
    so these must already have executed by then.
  -->
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  <style>
    /* Chart container: full page width, inline content centered. */
    #vis {
      width: 100%;
      text-align: center;
    }
  </style>
</head>
<body>
<div class="card">
  <h1>Visualizing Repo-level Dedupe</h1>
  <p>This visualization demonstrates the amount of <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/blog/from-files-to-chunks">chunk-level dedupe</a> across all public repos.</p>
  <p>"Dedupe factor" is defined as the number of re-uses of a given "xorb". A "xorb" is a collection of content-defined chunks, typically around 1,000 chunks comprising up to 64 MB of total data.</p>
  <!-- The list follows its lead-in paragraph as a sibling: <ul> is not permitted inside <p>. -->
  <p>Interactions:</p>
  <ul>
    <li>Hover to select a xorb, and highlight the same xorb in all other repos in <strong><span style="color: red">red</span></strong>.</li>
    <li>Click to select a row (repo), and fade out all repos that don't contain any overlapping data. Double-click to clear selection.</li>
  </ul>
</div>
<div id="vis"></div>
<script>
  // Vega-Lite spec: one facet row per repo, one rect per xorb, colored by
  // dedupe_factor. Rendered into the #vis container above.
  var vlSpec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "resolve": {"scale": {"x": "independent"}},
    "width": 600,
    "height": 12,
    "params": [
      {
        // Hover selection keyed on xorb_id; drives the red highlight below.
        "name": "highlight",
        "select": {"type": "point", "fields": ["xorb_id"], "on": "pointerover"}
      },
      {
        // Repo selection keyed on the repo field. Toggling is disabled so a
        // new selection replaces the previous one instead of adding to it.
        // Boolean false is the documented form; the string "false" would be
        // treated as an expression that merely evaluates to false.
        "name": "select",
        "select": {"type": "point", "fields": ["repo"], "toggle": false}
      },
      {
        // Collects repo_xorb_selected (see transform) from every row of the
        // 'source_0' dataset.
        // NOTE(review): 'source_0' is presumably the name Vega-Lite generates
        // for the single data source - confirm if more data sources are added.
        "name": "xorbs_selected",
        "expr": "pluck(data('source_0'), 'repo_xorb_selected')"
      },
      // True when xorbs_selected contains at least one non-null entry.
      {"name": "any_xorbs_selected", "expr": "extent(xorbs_selected)[0] != null"}
    ],
    "transform": [
      {
        // 1-based position of this row's repo in the current selection;
        // 0 when nothing is selected or the repo is not part of it.
        "calculate": "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
        "as": "repo_selected"
      },
      {
        // The row's xorb_id when its repo is selected, otherwise null.
        "calculate": "if(datum.repo_selected > 0, datum.xorb_id, null)",
        "as": "repo_xorb_selected"
      }
    ],
    "data": {
      "url": "xorbs.json"
    },
    "mark": "rect",
    "encoding": {
      "x": {
        "field": "xorb_id",
        "axis": null,
        "sort": {"field": "dedupe_factor", "order": "descending"},
        "stack": "normalize"
      },
      "color": {
        // Hovered xorb is painted red; everything else uses the
        // dedupe_factor color scale.
        "condition": [
          {"test": "datum.xorb_id == highlight.xorb_id", "value": "red"}
        ],
        "field": "dedupe_factor",
        "type": "quantitative",
        "scale": {"domain": [0, 10]}
      },
      "opacity": {
        // With a repo selected, fade every xorb that is not one of the
        // selected repo's xorbs.
        "condition": [
          {
            "test": "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
            "value": 0.2
          }
        ]
      },
      "tooltip": {"field": "dedupe_factor"},
      "row": {
        "field": "repo",
        "spacing": 1,
        "header": {"labelAngle": 0, "labelAlign": "left"},
        "sort": {"field": "dedupe_factor", "order": "descending"}
      }
    }
  };

  // vegaEmbed returns a Promise; log failures (e.g. xorbs.json failing to load
  // or an invalid spec) instead of leaving an unhandled rejection.
  vegaEmbed('#vis', vlSpec).catch(console.error);
</script>
</body>
</html>