Published
Edited
Mar 2, 2021
1 fork
Insert cell
md`# CAP 6737 - Assignment 06: Visual Cluster Analysis of Multidimensional Data`
Insert cell
viewof selectDropDown = columns({
x: select({
options: irisDims,
value: 'sepal_length'
}),
y: select({
options: irisDims,
value: 'sepal_width'
}),
})
Insert cell
{
const div = DOM.element('div');
const data = [];
data.push({
x: irisSpeciesGrouped.get('setosa').map(d => d[selectDropDown.x]),
y: irisSpeciesGrouped.get('setosa').map(d => d[selectDropDown.y]),
mode: 'markers',
marker: {
size: 4,
},
name: 'setosa'
});
data.push({
x: irisSpeciesGrouped.get("versicolor").map(d => d[selectDropDown.x]),
y: irisSpeciesGrouped.get("versicolor").map(d => d[selectDropDown.y]),
mode: 'markers',
marker: {
size: 4,
},
name: 'versicolor'
});
data.push({
x: irisSpeciesGrouped.get("virginica").map(d => d[selectDropDown.x]),
y: irisSpeciesGrouped.get("virginica").map(d => d[selectDropDown.y]),
mode: 'markers',
marker: {
size: 4,
},
name: 'virginica'
});
const layout = {
title: `Scatter Plot of Attribute ${selectDropDown.x} vs ${selectDropDown.y} `,
xaxis: {
autorange: true,
title: `${selectDropDown.x}`
},
yaxis: {
autorange: true,
title: `${selectDropDown.y}`
}
};

Plotly.newPlot(div, data, layout);
return div;
}
Insert cell
// Radio selector for the strategy used to seed the k-means cluster centers.
// The selected label is passed to initializeClusterCenters as its `method`
// argument by the clustering cell; "Forgy" is preselected.
viewof clusterIntitializationMethod = radio({
options: ["Random","Forgy", "Random Partition", "K-Mean++"],
value: "Forgy"
})
Insert cell
// Radio selector for how each cluster is outlined in the clustering plot:
// "polygon" draws a smoothed convex hull, any other value ("ellipse") draws
// an axis-aligned ellipse around the cluster center.
viewof clusterOutlineShape = radio({
options: ["polygon","ellipse"],
value: "polygon"
})
Insert cell
{
const myDiv = html`<div style="width:800px;height:550px;"></div>`;
const XY = iris.map(d => [d.sepal_length, d.petal_length]);
const k = 3;

//const { clusters, clusterCenters } = computeKmeanCluster(nClusters, xyData);
let clusterCenters = initializeClusterCenters(
k,
XY,
clusterIntitializationMethod
);
let clusters;

const colors = ["steelblue", "chocolate"]; // differnt color for each cluster
for (let i = 0; i < nIterations; i++) {
// Associate data points to cluster
clusters = clusterCenters.map(_ => []);
XY.forEach(d => {
const distanceToClusterCenters = clusterCenters.map(c =>
squaredDistance(c, d)
);
// Choose the closest center
clusters[d3.minIndex(distanceToClusterCenters)].push(d);
});
// Compute New Cluster center
for (let j = 0; j < k; j++)
clusterCenters[j] = updateClusterCenter(clusterCenters[j], clusters[j]);

const data = clusters.map((cluster, i) => {
// generate Trace data for the clusters
return {
x: cluster.map(d => d[0]),
y: cluster.map(d => d[1]),
mode: 'markers',
marker: {
size: 4,
color: colors[i]
}
};
});
const layout = {
showlegend: false,
autosize: false,
width: 800,
height: 550,
margin: { t: 50 },
hovermode: 'closest',
xaxis: {
title: 'sepal_length',
showgrid: false,
zeroline: false
},
yaxis: {
title: 'petal_length',
showgrid: false,
zeroline: false
}
};
layout.shapes = clusters.map((cluster, i) => {
const convexHull = d3.polygonHull(cluster);
if (clusterOutlineShape == "polygon") {
const path = d3.line().curve(d3.curveCatmullRomClosed)(convexHull);
return {
type: 'path',
path: path,
fillcolor: 'rgba(128, 128, 128, 0.4)',
line: {
color: "black"
}
};
} else {
const deltaX =
0.6 *
d3.max(convexHull.map(p => Math.abs(p[0] - clusterCenters[i][0])));
const deltaY =
0.6 *
d3.max(convexHull.map(p => Math.abs(p[1] - clusterCenters[i][1])));
return {
type: 'circle',
xref: 'x',
yref: 'y',
x0: clusterCenters[i][0] - deltaX,
y0: clusterCenters[i][1] - deltaY,
x1: clusterCenters[i][0] + deltaX,
y1: clusterCenters[i][1] + deltaY,
//opacity: 0.2,
fillcolor: 'rgba(128, 128, 128, 0.4)',
line: {
color: 'black'
}
};
}
});
Plotly.newPlot(myDiv, data, layout, { staticPlot: true });
await Promises.delay(1000);
yield myDiv;
}
}
Insert cell
md`## Imports`
Insert cell
import { soFetch } from '@alecglassford/so-fetch'
Insert cell
d3 = require("d3@6", "d3-regression@1.3.4")
Insert cell
Plotly = require("plotly.js-dist")
Insert cell
import { radio, select, slider } from "@jashkenas/inputs"
Insert cell
import { columns } from "@bcardiff/observable-columns"
Insert cell
md`## Load the Data`
Insert cell
iris = d3.csvParse(
await (await soFetch(
"https://raw.githubusercontent.com/plotly/datasets/master/iris-id.csv"
)).text(),
d3.autoType
)
Insert cell
// Distinct species labels, in order of first appearance in the data.
irisSpecies = Array.from(new Set(iris.map(({ species }) => species)))
Insert cell
// Names of the first four CSV columns; used as the options of the x/y
// attribute drop-downs. Presumably these are the four numeric measurements
// (sepal/petal length and width) — confirm against the CSV header.
irisDims = iris.columns.slice(0,4)
Insert cell
// Map from species label to the array of rows of that species; the scatter
// plot cell looks rows up with `.get(speciesName)`.
irisSpeciesGrouped = d3.group(iris, d => d['species']);
Insert cell
initializeClusterCenters = (k, XY, method) => {
const N = XY.length;
if (k == 0 || k > N) return null;
const clusterCenters = XY.slice(0, k).slice();

switch (method) {
case "Random":
const dim = XY[0].length;
for (let j = 0; j < k; j++) {
for (let i = 0; i < dim; i++)
clusterCenters[j][i] = d3.randomUniform(
...d3.extent(XY, d => d[i])
)();
}
break;
case "Forgy":
// Choose k distinct items from XY array using Reservoir Sampler.
// See algorithm: https://www.geeksforgeeks.org/reservoir-sampling1)
// Step I: copy first k items of data to clusterCenters.
for (let i = k; i < N; i++) {
// a) Generate a random number from 0 to i where i is index of current item in data.
// Let the generated random number is j.
const j = d3.randomInt(i + 1)();
// b) If j is in range 0 to k-1, replace reservoir[j] with arr[i]
if (j < k) clusterCenters[j] = XY[i].slice();
}
break;
case "Random Partition":
// Randomly assign a partition to each element of XY.
const clusterIndices = XY.map(_ => d3.randomInt(k)());
const lengths = [0, 0];
for (let j = 0; j < k; j++) {
const cluster = clusterIndices
.filter(index => index == j)
.map(index => XY[index]);
if (cluster.length > 0)
clusterCenters[j] = updateClusterCenter(clusterCenters[j], cluster);
}
break;
case "K-Mean++":
//Randomly select the first cluster Center from the data points.
clusterCenters[0] = XY[d3.randomInt(k)()].slice();
for (let j = 1; j < k; j++) {
//For each data point compute its distance from the nearest, previously chosen centroid.
const nearestDistances = XY.map(xy =>
d3.min(
d3
.range(j)
.map(index =>
Math.sqrt(squaredDistance(xy, clusterCenters[index]))
)
)
);
const sumNearestDistances = d3.sum(nearestDistances);
const probabilities = nearestDistances.map(
d => d / sumNearestDistances
);
const cumProbabilities = probabilities.slice();
for (let i = 1; i < N; i++)
cumProbabilities[i] += cumProbabilities[i - 1];
// Select the next centers from the data points such that the probability of choosing a point as centroid is
// directly proportional to its distance from the nearest, previously chosen centroid.
// the point having maximum distance from the nearest centroid is most likely to be selected next as a centroid

const x = Math.random();
const sampleIndex = d3.bisect(cumProbabilities, x);
mutable feedback = sampleIndex;

clusterCenters[j] = XY[sampleIndex].slice();
}
break;
}
return clusterCenters;
}
Insert cell
/**
 * Compute the new center of a cluster as the per-coordinate mean of its points.
 *
 * The input `center` is not modified; a new array is returned. This matters
 * because a center may be the very same array object as one of the data
 * points, and overwriting it would silently move that point.
 *
 * @param {number[]} center - Current center; determines the dimensionality.
 * @param {number[][]} cluster - Points currently assigned to this center.
 * @returns {number[]} The mean point of `cluster`, or an unchanged copy of
 *   `center` when `cluster` is empty (the mean of nothing is undefined).
 */
updateClusterCenter = (center, cluster) => {
  if (cluster.length === 0) return center.slice();
  return center.map((_, i) => d3.mean(cluster, d => d[i]));
}
Insert cell
// Squared Euclidean distance between points p and q (arrays of coordinates).
// Iterates over the coordinates of p; q is indexed at the same positions.
squaredDistance = (p, q) =>
  d3.sum(p, (coordinate, i) => {
    const delta = coordinate - q[i];
    return delta * delta;
  })
Insert cell
// Mutable scratch cell, initially an empty string. The "K-Mean++" branch of
// initializeClusterCenters writes the most recently sampled data index here.
mutable feedback = ""
Insert cell
// Number of k-means rounds (assign points, then update centers) that the
// clustering animation cell runs and renders.
nIterations = 5
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more