xet-repo-dedupe / index.html
znation's picture
znation HF Staff
styling
0b21dfd
raw
history blame
1.84 kB
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- initial-scale=1 pairs with width=device-width so the page opens at 1:1 zoom on mobile. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>xet-repo-dedupe</title>
  <link rel="stylesheet" href="style.css">
  <!--
    Vega, Vega-Lite and vega-embed from the jsDelivr CDN.
    They are intentionally loaded without defer/async: the inline script in
    <body> calls vegaEmbed as soon as it is parsed, so these must already
    have executed by then.
  -->
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  <style>
    /* Chart container: full page width, inline content centered inside it. */
    #vis {
      width: 100%;
      text-align: center;
    }
  </style>
</head>
<body>
<!-- Intro card: page heading and a one-paragraph explanation of the visualization. -->
<div class="card">
  <h1>Visualizing Repo-level Dedupe</h1>
  <p>
    This visualization demonstrates the amount of
    <a href="https://huggingface.co/blog/from-files-to-chunks">chunk-level dedupe</a>
    within a repo or across a selection of repos.
    (For now, demonstrates a hardcoded selection.)
  </p>
</div>
<!-- Mount point for the chart: the inline script below passes the '#vis' selector to vegaEmbed. -->
<div id="vis"></div>
<script>
  // Vega-Lite spec for the dedupe chart.
  // Data comes from xorbs.json (resolved relative to this page). Each record is
  // drawn as a rect positioned by "xorb_id" and colored by "dedupe_factor";
  // records are split into one facet row per "repo".
  var vlSpec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    // Vega exposes the description as the aria-label of the rendered view's
    // container, giving assistive technology a name for the chart.
    "description": "Repo-level dedupe: one row per repo; each rectangle is a xorb colored by its dedupe factor.",
    // Every facet row gets its own x scale instead of sharing one across repos.
    "resolve": {"scale": {"x": "independent"}},
    // Size of each facet cell (one thin strip per repo).
    "width": 600,
    "height": 12,
    "data": {
      "url": "xorbs.json"
    },
    "mark": "rect",
    "encoding": {
      "x": {
        "field": "xorb_id",
        // No axis is drawn for the xorb ids.
        "axis": null,
        "sort": {"field": "dedupe_factor", "order": "descending"},
        "stack": "normalize"
      },
      "color": {
        "field": "dedupe_factor",
        "type": "quantitative",
        // Fixed color domain so every row uses the same 0-10 color mapping.
        "scale": {"domain": [0, 10]}
      },
      "tooltip": {"field": "dedupe_factor"},
      "row": {
        "field": "repo",
        "spacing": 1,
        // Horizontal, left-aligned repo labels next to each strip.
        "header": {"labelAngle": 0, "labelAlign": "left"},
        "sort": {"field": "dedupe_factor", "order": "descending"}
      }
    }
  };

  // vegaEmbed returns a promise; log a rejection instead of leaving it unhandled
  // so a failed embed is visible in the console.
  vegaEmbed('#vis', vlSpec).catch(console.error);
</script>
</body>
</html>