Published
Edited
Feb 22, 2021
Insert cell
md`# Visual Cluster Analysis
Uses the [K-means algorithm](https://en.wikipedia.org/wiki/K-means_clustering) for a user-defined *k*.
The convex hull/ellipse of the clusters is shown.`
Insert cell
md`### Scatter Plot: Old Faithful Data`
Insert cell
{
const myDiv = html`<div style="width:800px;height:550px;"></div>`;
const xData = oldFaithful.map(d => d.waiting);
const yData = oldFaithful.map(d => d.eruptions);

const trace1 = {
x: xData,
y: yData,
mode: 'markers',
marker: {
size: 4
}
};
const layout = {
showlegend: false,
autosize: false,
width: 800,
height: 550,
margin: { t: 50 },
hovermode: 'closest',
xaxis: {
title: 'Waiting Time (minutes)',
showgrid: false,
zeroline: false
},
yaxis: {
title: 'Eruption Duration (minutes)',
showgrid: false,
zeroline: false
}
};
const data = [trace1];
Plotly.newPlot(myDiv, data, layout);
return myDiv;
}
Insert cell
// Number of assign-points / update-centers passes run by the K-means loops in this notebook.
nIterations = 5
Insert cell
feedback
Insert cell
Insert cell
Insert cell
// Radio input choosing how each cluster is outlined in the cluster plot:
// "polygon" (smoothed convex hull) or "ellipse". Starts on "polygon".
viewof clusterOutlineShape = radio({
options: ["polygon","ellipse"],
value: "polygon"
})
Insert cell
{
const myDiv = html`<div style="width:800px;height:550px;"></div>`;
const xyData = oldFaithful.map(d => [d.waiting, d.eruptions]);
const nClusters = 2;
//const xyData = iris.map(d => [d.sepal_length, d.sepal_width]);
//const nClusters = 3;

const { clusters, clusterCenters } = computeKmeanCluster(nClusters, xyData);

const colors = ["steelblue", "chocolate"]; // differnt color for each cluster
const data = clusters.map((cluster, i) => {
// generate Trace data for the clusters
return {
x: cluster.map(d => d[0]),
y: cluster.map(d => d[1]),
mode: 'markers',
marker: {
size: 4,
color: colors[i]
}
};
});
const layout = {
showlegend: false,
autosize: false,
width: 800,
height: 550,
margin: { t: 50 },
hovermode: 'closest',
xaxis: {
title: 'Waiting Time (minutes)',
range: [30, 100],
showgrid: false,
zeroline: false
},
yaxis: {
title: 'Eruption Duration (minutes)',
range: [1, 6],
showgrid: false,
zeroline: false
}
};
layout.shapes = clusters.map((cluster, i) => {
const convexHull = d3.polygonHull(cluster);
if (clusterOutlineShape == "polygon") {
const path = d3.line().curve(d3.curveCatmullRomClosed)(convexHull);
return {
type: 'path',
path: path,
fillcolor: 'rgba(128, 128, 128, 0.4)',
line: {
color: "black"
}
};
} else {
const deltaX =
0.6 *
d3.max(convexHull.map(p => Math.abs(p[0] - clusterCenters[i][0])));
const deltaY =
0.6 *
d3.max(convexHull.map(p => Math.abs(p[1] - clusterCenters[i][1])));
return {
type: 'circle',
xref: 'x',
yref: 'y',
x0: clusterCenters[i][0] - deltaX,
y0: clusterCenters[i][1] - deltaY,
x1: clusterCenters[i][0] + deltaX,
y1: clusterCenters[i][1] + deltaY,
//opacity: 0.2,
fillcolor: 'rgba(128, 128, 128, 0.4)',
line: {
color: 'black'
}
};
}
});
Plotly.newPlot(myDiv, data, layout, { staticPlot: true });
return myDiv;
}
Insert cell
// Mutable debug value; initializeClusterCenters writes the sampled point index
// into it while running the "K-Mean++" initialization.
mutable feedback = ""
Insert cell
/**
 * Runs `nIterations` passes of K-means over the points in `XY`.
 *
 * @param {number} k - number of clusters.
 * @param {number[][]} XY - data points, one coordinate array per point.
 * @returns {{clusterCenters: number[][], clusters: number[][][]}} the final
 *   centers and, per center, the points assigned to it in the last pass.
 */
computeKmeanCluster = (k, XY) => {
  const clusterCenters = initializeClusterCenters(
    k,
    XY,
    clusterIntitializationMethod
  );
  let clusters;
  let pass = 0;
  while (pass < nIterations) {
    // Assignment step: start from one empty bucket per center and drop each
    // point into the bucket of its closest center.
    const buckets = clusterCenters.map(() => []);
    clusters = buckets;
    for (const point of XY) {
      const nearest = d3.minIndex(clusterCenters, center =>
        squaredDistance(center, point)
      );
      buckets[nearest].push(point);
    }
    // Update step: recompute every center from the points it received.
    for (let j = 0; j < k; j++) {
      clusterCenters[j] = updateClusterCenter(clusterCenters[j], buckets[j]);
    }
    pass += 1;
  }
  return { clusterCenters, clusters };
}
Insert cell
Insert cell
Insert cell
{
const myDiv = html`<div style="width:800px;height:550px;"></div>`;
const XY = oldFaithful.map(d => [d.waiting, d.eruptions]);
const k = 2;

//const { clusters, clusterCenters } = computeKmeanCluster(nClusters, xyData);
let clusterCenters = initializeClusterCenters(
k,
XY,
clusterIntitializationMethod1
);
let clusters;

const colors = ["steelblue", "chocolate"]; // differnt color for each cluster
for (let i = 0; i < nIterations; i++) {
// Associate data points to cluster
clusters = clusterCenters.map(_ => []);
XY.forEach(d => {
const distanceToClusterCenters = clusterCenters.map(c =>
squaredDistance(c, d)
);
// Choose the closest center
clusters[d3.minIndex(distanceToClusterCenters)].push(d);
});
// Compute New Cluster center
for (let j = 0; j < k; j++)
clusterCenters[j] = updateClusterCenter(clusterCenters[j], clusters[j]);

const data = clusters.map((cluster, i) => {
// generate Trace data for the clusters
return {
x: cluster.map(d => d[0]),
y: cluster.map(d => d[1]),
mode: 'markers',
marker: {
size: 4,
color: colors[i]
}
};
});
const layout = {
showlegend: false,
autosize: false,
width: 800,
height: 550,
margin: { t: 50 },
hovermode: 'closest',
xaxis: {
title: 'Waiting Time (minutes)',
range: [30, 100],
showgrid: false,
zeroline: false
},
yaxis: {
title: 'Eruption Duration (minutes)',
range: [1, 6],
showgrid: false,
zeroline: false
}
};
layout.shapes = clusters.map((cluster, i) => {
const convexHull = d3.polygonHull(cluster);
if (clusterOutlineShape1 == "polygon") {
const path = d3.line().curve(d3.curveCatmullRomClosed)(convexHull);
return {
type: 'path',
path: path,
fillcolor: 'rgba(128, 128, 128, 0.4)',
line: {
color: "black"
}
};
} else {
const deltaX =
0.6 *
d3.max(convexHull.map(p => Math.abs(p[0] - clusterCenters[i][0])));
const deltaY =
0.6 *
d3.max(convexHull.map(p => Math.abs(p[1] - clusterCenters[i][1])));
return {
type: 'circle',
xref: 'x',
yref: 'y',
x0: clusterCenters[i][0] - deltaX,
y0: clusterCenters[i][1] - deltaY,
x1: clusterCenters[i][0] + deltaX,
y1: clusterCenters[i][1] + deltaY,
//opacity: 0.2,
fillcolor: 'rgba(128, 128, 128, 0.4)',
line: {
color: 'black'
}
};
}
});
Plotly.newPlot(myDiv, data, layout, { staticPlot: true });
await Promises.delay(1000);
yield myDiv;
}
}
Insert cell
/**
 * Squared Euclidean distance between two points.
 *
 * @param {number[]} p - first point; its length decides how many coordinates are compared.
 * @param {number[]} q - second point.
 * @returns {number} sum over the coordinates of (p[i] - q[i]) squared.
 */
squaredDistance = (p, q) =>
  d3.sum(p, (coordinate, axis) => {
    const gap = coordinate - q[axis];
    return gap * gap;
  })
Insert cell
/**
 * Moves `center` to the coordinate-wise mean of the points in `cluster`.
 * The center array is updated in place and also returned.
 *
 * @param {number[]} center - current center; its length is the number of coordinates updated.
 * @param {number[][]} cluster - points assigned to this center.
 * @returns {number[]} the same `center` array, updated.
 */
updateClusterCenter = (center, cluster) => {
  for (let i = 0; i < center.length; i++) {
    const coordinateMean = d3.mean(cluster, d => d[i]);
    // d3.mean yields undefined when the cluster has no (valid) values. Keep the
    // previous coordinate in that case; overwriting it with undefined would
    // turn every later distance to this center into NaN, so the center could
    // never attract points again.
    if (coordinateMean !== undefined) center[i] = coordinateMean;
  }
  return center;
}
Insert cell
feedback
Insert cell
d3.randomUniform(...[1, 5])()
Insert cell
initializeClusterCenters = (k, XY, method) => {
const N = XY.length;
if (k == 0 || k > N) return null;
const clusterCenters = XY.slice(0, k).slice();

switch (method) {
case "Random":
const dim = XY[0].length;
for (let j = 0; j < k; j++) {
for (let i = 0; i < dim; i++)
clusterCenters[j][i] = d3.randomUniform(
...d3.extent(XY, d => d[i])
)();
}
break;
case "Forgy":
// Choose k distinct items from XY array using Reservoir Sampler.
// See algorithm: https://www.geeksforgeeks.org/reservoir-sampling1)
// Step I: copy first k items of data to clusterCenters.
for (let i = k; i < N; i++) {
// a) Generate a random number from 0 to i where i is index of current item in data.
// Let the generated random number is j.
const j = d3.randomInt(i + 1)();
// b) If j is in range 0 to k-1, replace reservoir[j] with arr[i]
if (j < k) clusterCenters[j] = XY[i].slice();
}
break;
case "Random Partition":
// Randomly assign a partition to each element of XY.
const clusterIndices = XY.map(_ => d3.randomInt(k)());
const lengths = [0, 0];
for (let j = 0; j < k; j++) {
const cluster = clusterIndices
.filter(index => index == j)
.map(index => XY[index]);
if (cluster.length > 0)
clusterCenters[j] = updateClusterCenter(clusterCenters[j], cluster);
}
break;
case "K-Mean++":
//Randomly select the first cluster Center from the data points.
clusterCenters[0] = XY[d3.randomInt(k)()].slice();
for (let j = 1; j < k; j++) {
//For each data point compute its distance from the nearest, previously chosen centroid.
const nearestDistances = XY.map(xy =>
d3.min(
d3
.range(j)
.map(index =>
Math.sqrt(squaredDistance(xy, clusterCenters[index]))
)
)
);
const sumNearestDistances = d3.sum(nearestDistances);
const probabilities = nearestDistances.map(
d => d / sumNearestDistances
);
const cumProbabilities = probabilities.slice();
for (let i = 1; i < N; i++)
cumProbabilities[i] += cumProbabilities[i - 1];
// Select the next centers from the data points such that the probability of choosing a point as centroid is
// directly proportional to its distance from the nearest, previously chosen centroid.
// the point having maximum distance from the nearest centroid is most likely to be selected next as a centroid

const x = Math.random();
const sampleIndex = d3.bisect(cumProbabilities, x);
mutable feedback = sampleIndex;

clusterCenters[j] = XY[sampleIndex].slice();
}
break;
}
return clusterCenters;
}
Insert cell
d3.range(2)
Insert cell
d3.randomInt(4 + 1)()
Insert cell
md`## Data`
Insert cell
// Old Faithful data set, fetched as TSV and parsed with d3.autoType.
// The plotting cells read the `waiting` and `eruptions` fields of each row.
oldFaithful = d3.tsv(
"https://gist.githubusercontent.com/mbostock/e3f4376d54e02d5d43ae32a7cf0e6aa9/raw/dcb23e8f6eefdbc4ada97d6eda22b2a4f256c263/faithful.tsv",
d3.autoType
)
Insert cell
// Iris data set, fetched as CSV and parsed with d3.autoType.
// In the visible cells it is only referenced from commented-out code
// (sepal_length / sepal_width as an alternative to the Old Faithful data).
iris = d3.csv(
"https://raw.githubusercontent.com/plotly/datasets/master/iris-id.csv", //"https://datahub.io/machine-learning/iris/r/iris.csv"
d3.autoType
)
Insert cell
md`## External Libraries and Imports`
Insert cell
import { radio, select, slider } from "@jashkenas/inputs"
Insert cell
import { columns } from "@bcardiff/observable-columns"
Insert cell
Plotly = require("plotly.js-dist")
Insert cell
d3= require ("d3@6")
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more