xet-repo-dedupe / index.html
znation's picture
znation HF staff
Clarify interactivity
67629b7
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- lang="en" on the root element declares the language of the page copy for assistive technology. -->
  <!-- initial-scale=1 keeps the page at 1:1 zoom on first load; user zooming stays enabled. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>xet-repo-dedupe</title>
  <link rel="stylesheet" href="style.css">
  <!--
    Vega, Vega-Lite and vega-embed from the jsDelivr CDN.
    These are intentionally loaded without defer/async: the inline script in
    <body> calls vegaEmbed() as soon as it is parsed, so the libraries must
    already be defined at that point.
  -->
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  <style>
    /* Chart container: take the full available width and center its inline content. */
    #vis {
      width: 100%;
      text-align: center;
    }
  </style>
</head>
<body>
<!-- Intro card: explains what the chart shows and how to interact with it. -->
<div class="card">
  <h1>Visualizing Repo-level Dedupe</h1>
  <p>This visualization demonstrates the amount of <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/blog/from-files-to-chunks">chunk-level dedupe</a> across all public repos.</p>
  <p>"Dedupe factor" is defined as the number of re-uses of a given "xorb". A "xorb" is a collection of content-defined chunks, typically around 1,000 chunks comprising up to 64 MB of total data.</p>
  <!--
    The list is a sibling of the paragraph, not a child: <ul> is not permitted
    inside <p>, so the parser would close the paragraph early and turn a
    trailing </p> into an extra empty paragraph.
  -->
  <p>Interactions:</p>
  <ul>
    <li>
      Hover to select a xorb, and highlight the same xorb in all other repos in <strong><span style="color: red">red</span></strong>.
    </li>
    <li>
      Click to select a row (repo), and fade out all repos that don't contain any overlapping data. Double-click to clear selection.
    </li>
  </ul>
</div>
<div id="vis"></div>
<script>
// Vega-Lite spec for the dedupe chart: rect marks positioned by xorb_id,
// colored by dedupe_factor, faceted into one row per repo.
var vlSpec = {
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  // Every facet row resolves its own x scale instead of sharing one.
  "resolve": {"scale": {"x": "independent"}},
  "width": 600,
  "height": 12,
  "params": [
    {
      // Hover selection keyed on xorb_id; read by the color condition below.
      "name": "highlight",
      "select": {"type": "point", "fields": ["xorb_id"], "on": "pointerover"}
    },
    {
      // Click selection keyed on repo. `toggle` is the boolean false so that
      // toggling is disabled outright (a quoted "false" is parsed as an
      // expression string rather than as the documented boolean option).
      "name": "select",
      "select": {"type": "point", "fields": ["repo"], "toggle": false}
    },
    {
      // Collects the repo_xorb_selected value of every record in the dataset.
      // NOTE(review): 'source_0' is presumably the dataset name generated by
      // the Vega-Lite compiler for the `data` entry below — confirm if the
      // spec gains further data sources or layers.
      "name": "xorbs_selected",
      "expr": "pluck(data('source_0'), 'repo_xorb_selected')"
    },
    // True when the collected values contain at least one non-null entry,
    // i.e. some record belongs to a selected repo.
    {"name": "any_xorbs_selected", "expr": "extent(xorbs_selected)[0] != null"}
  ],
  "transform": [
    {
      // 1-based position of this record's repo in the click selection,
      // or 0 when nothing is selected / the repo is not in the selection.
      "calculate": "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
      "as": "repo_selected"
    },
    {
      // The record's xorb_id when its repo is selected, otherwise null.
      "calculate": "if(datum.repo_selected > 0, datum.xorb_id, null)",
      "as": "repo_xorb_selected"
    }
  ],
  "data": {
    "url": "xorbs.json"
  },
  "mark": "rect",
  "encoding": {
    "x": {
      "field": "xorb_id",
      "axis": null,
      "sort": {"field": "dedupe_factor", "order": "descending"},
      "stack": "normalize"
    },
    "color": {
      // Records whose xorb_id matches the hovered xorb are drawn red;
      // everything else is colored by dedupe_factor on a fixed 0-10 domain.
      "condition": [
        {"test": "datum.xorb_id == highlight.xorb_id", "value": "red"}
      ],
      "field": "dedupe_factor",
      "type": "quantitative",
      "scale": {"domain": [0, 10]}
    },
    "opacity": {
      // While a repo is selected, fade every xorb that does not appear in
      // the selected repo's xorb list.
      "condition": [
        {
          "test": "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
          "value": 0.2
        }
      ]
    },
    "tooltip": {"field": "dedupe_factor"},
    "row": {
      "field": "repo",
      "spacing": 1,
      "header": {"labelAngle": 0, "labelAlign": "left"},
      "sort": {"field": "dedupe_factor", "order": "descending"}
    }
  }
};

// vegaEmbed returns a Promise; log failures (e.g. xorbs.json not loading or
// a spec error) instead of leaving an unhandled rejection.
vegaEmbed('#vis', vlSpec).catch(console.error);
</script>
</body>
</html>