|
|
|
// Colour palette as [red, green, blue] triplets; the channel values are kept
// as strings. getColor picks an entry by index (modulo the palette length)
// and joins its channels into an `rgba(...)` string.
const COLORS = [
  '46,145,229',
  '225,95,153',
  '28,167,28',
  '251,13,13',
  '218,22,255',
  '182,129,0',
  '117,13,134',
  '235,102,59',
  '81,28,251',
  '0,160,139',
  '251,0,209',
  '252,0,128',
  '178,130,141',
  '108,124,50',
  '119,138,174',
  '134,42,22',
  '167,119,241',
  '98,0,66',
  '22,22,167',
  '218,96,202',
  '108,69,22',
  '175,0,56',
].map((rgb) => rgb.split(','));
|
|
|
/**
 * Builds the HTML snippet shown when hovering a data point.
 *
 * @param {{text: *, eduScore: *}} row - Data point; `text` and `eduScore`
 *   are interpolated into the snippet as-is.
 * @returns {string} Hover text with bold "Text" and "Edu label" captions.
 */
const getLabelHoverFormat = (row) => {
  const { text, eduScore } = row;
  return `<b>Text</b>: ${text}<br><b>Edu label</b>: ${eduScore}`;
};
|
|
|
|
|
// Default cap on how many annotations getRelevantAnnotations returns.
const K = 15;
|
|
|
|
|
/**
 * Ranks labels by how often they occur.
 *
 * @param {Array<string|number>} labels - One label per item; labels are used
 *   as object keys, so they are compared by their string form.
 * @returns {Object<string, number>} Map from label to rank, where rank 0 is
 *   the most frequent label. Labels with equal counts keep the order in which
 *   `Object.entries` enumerates them.
 */
function createLabelOrderMapping(labels) {
  // Tally occurrences per label.
  const counts = {};
  labels.forEach((label) => {
    counts[label] = (counts[label] || 0) + 1;
  });

  // Most frequent first.
  const ranked = Object.entries(counts);
  ranked.sort(([, countA], [, countB]) => countB - countA);

  const labelOrder = {};
  for (let rank = 0; rank < ranked.length; rank += 1) {
    const [label] = ranked[rank];
    labelOrder[label] = rank;
  }
  return labelOrder;
}
|
|
|
|
|
|
|
|
|
/**
 * Loads cluster annotations from a CSV file.
 *
 * Reads the `cluster_id`, `cluster_position_x`, `cluster_position_y` and
 * `cluster_summaries` columns of every row. Rows whose `cluster_id` is -1 are
 * skipped (presumably the "no cluster" bucket — confirm with the data
 * producer).
 *
 * @param {string} file - Location of the CSV file, forwarded to readCSV.
 * @returns {Promise<Array<{x: number, y: number, label: number, text: string}>>}
 *   One annotation per remaining row.
 */
const parseAnnotations = async (file) => {
  const rows = await readCSV(file);
  const annotations = [];
  for (const row of rows) {
    // Parse the id once, always as base 10.
    const label = Number.parseInt(row.cluster_id, 10);
    if (label === -1) {
      continue;
    }
    annotations.push({
      x: Number.parseFloat(row.cluster_position_x),
      y: Number.parseFloat(row.cluster_position_y),
      label,
      text: row.cluster_summaries,
    });
  }
  return annotations;
};
|
|
|
/**
 * Decorates annotations with the visual properties used in the plot layout.
 *
 * Each returned object is a copy of the input annotation plus `showarrow`,
 * `font`, `bgcolor` (taken from getColor for the annotation's label at 0.9
 * opacity) and `borderpad`. The input objects are not modified.
 *
 * @param {Array<Object>} annotations - Annotations carrying at least `label`.
 * @returns {Array<Object>} Styled copies, in the same order.
 */
const addStylingToAnnotations = (annotations) => {
  const withStyle = (annotation) =>
    Object.assign({}, annotation, {
      showarrow: false,
      font: { size: 14, color: 'black', weight: 'bold' },
      bgcolor: getColor(annotation.label, 0.9),
      borderpad: 2,
    });

  return annotations.map((annotation) => withStyle(annotation));
};
|
|
|
/**
 * Selects the annotations to display for a rectangular region.
 *
 * Keeps the annotations whose `x`/`y` lie inside the inclusive rectangle
 * [x0, x1] x [y0, y1], orders them by ascending `ord` and returns the first
 * `k`. The input array is left untouched.
 *
 * @param {Array<{x: number, y: number, ord: number}>} annotations
 * @param {number} x0 - Lower x bound.
 * @param {number} x1 - Upper x bound.
 * @param {number} y0 - Lower y bound.
 * @param {number} y1 - Upper y bound.
 * @param {number} [k=K] - Maximum number of annotations to return.
 * @returns {Array<Object>} At most `k` annotations, lowest `ord` first.
 */
const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k = K) => {
  const isInside = ({ x, y }) => x >= x0 && x <= x1 && y >= y0 && y <= y1;

  // filter() yields a fresh array, so sorting it does not reorder the input.
  const visible = annotations.filter(isInside);
  visible.sort((first, second) => first.ord - second.ord);
  return visible.slice(0, k);
};
|
|
|
|
|
/**
 * Computes the bounding box of a list of points.
 *
 * Walks the list once instead of spreading it into `Math.min`/`Math.max`:
 * spreading passes every point as a separate call argument, which throws a
 * RangeError once the list is large enough.
 *
 * @param {Array<{x: number, y: number}>} traces - Points to measure.
 * @returns {{x0: number, x1: number, y0: number, y1: number}} Smallest and
 *   largest x (`x0`, `x1`) and y (`y0`, `y1`). For an empty list the minima
 *   are `Infinity` and the maxima `-Infinity`; a NaN coordinate makes the
 *   bounds of that axis NaN.
 */
const getMinMaxTracesArea = (traces) => {
  let x0 = Infinity;
  let x1 = -Infinity;
  let y0 = Infinity;
  let y1 = -Infinity;

  for (const trace of traces) {
    x0 = Math.min(x0, trace.x);
    x1 = Math.max(x1, trace.x);
    y0 = Math.min(y0, trace.y);
    y1 = Math.max(y1, trace.y);
  }

  return { x0, x1, y0, y1 };
};
|
|
|
/**
 * Loads the data points from `data/clustering/data.csv`.
 *
 * Reads the `X`, `Y`, `edu_labels`, `cluster_labels` and `content_display`
 * columns of every row and converts the numeric ones.
 *
 * @returns {Promise<Array<{x: number, y: number, eduScore: number, label: number, text: string}>>}
 *   One entry per CSV row, in file order.
 */
const readData = async () => {
  const rows = await readCSV('data/clustering/data.csv');
  return rows.map((row) => ({
    x: Number.parseFloat(row.X),
    y: Number.parseFloat(row.Y),
    eduScore: Number.parseFloat(row.edu_labels),
    // Explicit radix: cluster labels are always read as base 10.
    label: Number.parseInt(row.cluster_labels, 10),
    text: row.content_display,
  }));
};
|
|
|
/**
 * Renders the cluster scatter plot into the `#clusters-plot` element.
 *
 * Loads the data points and the cluster annotations, draws one marker per
 * point coloured by its cluster label, and shows a limited number of cluster
 * annotations for the visible region, preferring the clusters with the most
 * points. The annotation set is recomputed whenever an axis range changes or
 * is reset, and the plot width follows the container on window resize
 * (ignored while the window is narrower than 768px).
 *
 * @returns {Promise<void>} Settles once the plot is created and the event
 *   handlers are registered; rejects if loading either CSV file fails.
 */
async function plotClusters() {
  const parent = document.getElementById('clusters-plot');

  // The two CSV files are independent of each other, so load them in parallel.
  const [data, clusterAnnotations] = await Promise.all([
    readData(),
    parseAnnotations('data/clustering/info.csv'),
  ]);

  const traces = [{
    type: 'scatter',
    mode: 'markers',
    x: data.map((row) => row.x),
    y: data.map((row) => row.y),
    marker: {
      color: data.map((row) => getColor(row.label, 1.0)),
      size: 5,
      // Plotly expects an opacity between 0 and 1; the previous value of 8
      // was out of range.
      opacity: 0.8,
    },
    hoverinfo: 'text',
    hovertext: data.map((row) => getLabelHoverFormat(row)),
    hoverlabel: {
      bgcolor: 'white',
    },
  }];

  // `ord` ranks each annotation by the size of its cluster (0 = largest).
  const labelOrder = createLabelOrderMapping(data.map((row) => row.label));
  const annotations = clusterAnnotations.map((annotation) => ({
    ...annotation,
    ord: labelOrder[annotation.label],
  }));

  const bounds = getMinMaxTracesArea(data);
  // Last known visible region. A relayout event may only describe one axis,
  // so the other axis has to be remembered from earlier events.
  let viewport = { ...bounds };

  const buildAnnotations = ({ x0, x1, y0, y1 }) =>
    addStylingToAnnotations(getRelevantAnnotations(annotations, x0, x1, y0, y1));

  const layout = {
    height: 550,
    width: parent.clientWidth,
    xaxis: {
      showticklabels: false,
      showgrid: false,
      zeroline: false,
      title: {
        text: "Fineweb dataset (clustered using TODO and labeled using TODO),<br> zoom in to see more",
        font: {
          size: 16,
          style: 'italic',
        },
      },
    },
    yaxis: {
      showticklabels: false,
      showgrid: false,
      zeroline: false,
    },
    annotations: buildAnnotations(viewport),
    font: {
      family: "apple-system, Arial, sans-serif",
    },
    margin: {
      t: 0,
      b: 30,
    },
  };

  Plotly.newPlot(parent, traces, layout);

  const axes = [
    { name: 'xaxis', low: 'x0', high: 'x1' },
    { name: 'yaxis', low: 'y0', high: 'y1' },
  ];

  parent.on('plotly_relayout', (eventdata) => {
    const next = { ...viewport };
    let changed = false;

    for (const { name, low, high } of axes) {
      const start = eventdata[`${name}.range[0]`];
      const end = eventdata[`${name}.range[1]`];

      // Compare against undefined: a bound of exactly 0 is a valid value.
      if (start !== undefined || end !== undefined) {
        if (start !== undefined) {
          next[low] = start;
        }
        if (end !== undefined) {
          next[high] = end;
        }
        changed = true;
      } else if (eventdata[`${name}.autorange`]) {
        // Axis was reset: fall back to the extent of the data.
        next[low] = bounds[low];
        next[high] = bounds[high];
        changed = true;
      }
    }

    // Events without axis information (including the one caused by the
    // relayout below) need no work.
    if (!changed) {
      return;
    }

    viewport = next;
    // Only the annotations change. Re-sending the whole initial layout would
    // also re-apply the width captured at first render, undoing any resize.
    Plotly.relayout(parent, { annotations: buildAnnotations(viewport) });
  });

  window.addEventListener('resize', () => {
    if (window.innerWidth < 768) {
      return;
    }
    Plotly.relayout(parent, {
      width: parent.offsetWidth,
    });
  });
}
|
|
|
// Draw the plot once the DOM is ready. plotClusters is async, so its promise
// is handled here rather than left floating.
document.addEventListener('DOMContentLoaded', () => {
  plotClusters().catch((error) => {
    console.error('Failed to render the clusters plot:', error);
  });
});
|
|
|
|
|
/**
 * Fetches a CSV file and parses it with Papa Parse, using the first line as
 * the header.
 *
 * @param {string} file - URL or path handed to `fetch`.
 * @returns {Promise<Array<Object>>} One object per data row, keyed by the
 *   header names.
 * @throws {Error} If the HTTP response status is not OK.
 */
const readCSV = async (file) => {
  const response = await fetch(file);
  if (!response.ok) {
    // Without this check an error page would be parsed as if it were CSV.
    throw new Error(`Failed to load ${file}: ${response.status} ${response.statusText}`);
  }

  const text = await response.text();
  // Skip blank lines (e.g. after a trailing newline) so they do not become
  // rows with missing fields that later parse to NaN.
  const { data } = Papa.parse(text, { header: true, skipEmptyLines: true });
  return data;
};
|
|
|
|
|
|
|
/**
 * Returns the palette colour for an index as a CSS `rgba(...)` string.
 *
 * The sign of the index is ignored and the index wraps around the palette
 * length, so every integer maps to a colour.
 *
 * @param {number} i - Colour index (for example a cluster label).
 * @param {number} opacity - Alpha component, inserted as-is.
 * @returns {string} `rgba(r,g,b, opacity)` built from the COLORS palette.
 */
const getColor = (i, opacity) => {
  const channels = COLORS[Math.abs(i) % COLORS.length];
  return `rgba(${channels.join(',')}, ${opacity})`;
};