Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Load the vega-datasets package (1.x) and call its 'cars.json' entry, which is
// exposed as a function, to obtain the cars records used throughout the notebook.
cars = (await require('vega-datasets@1'))['cars.json']()
Insert cell
// Field names of a car record: six numeric columns followed by the car's Name
// and its Origin.
keys = ['Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs', 'Acceleration', 'Miles_per_Gallon', 'Name', 'Origin']
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
cars_data = {
var cars_data = cars_filtered.map((row) =>
[[row.Cylinders, row.Displacement, row.Horsepower, row.Weight_in_lbs, row.Acceleration, row.Miles_per_Gallon],
row.Origin, row.Name]);
return cars_data;
}
Insert cell
Insert cell
data_split = {
var cars_shuffled = shuffleArray(cars_data);
const train_count = 0.8*cars_shuffled.length; // we will be using 80% of the data for training
var X_train = cars_shuffled.slice(0, train_count).map(x => x[0]);
var X_test = cars_shuffled.slice(train_count).map(x => x[0]);
var Y_train = cars_shuffled.slice(0, train_count).map(x => x[1]);
var Y_test = cars_shuffled.slice(train_count).map(x => x[1]);
var names_train = cars_shuffled.slice(0, train_count).map(x => x[2]);
var names_test = cars_shuffled.slice(train_count).map(x => x[2]);
return [X_train, X_test, Y_train, Y_test, names_train, names_test];
}
Insert cell
Insert cell
// k-nearest-neighbours model built from the training features (data_split[0])
// and the training labels (data_split[2]), using k = 5.
knn = new ML.KNN(data_split[0], data_split[2], {k:5});
Insert cell
Insert cell
// KNN predictions for the held-out test features (data_split[1]).
knn_results = knn.predict(data_split[1])
Insert cell
// Score the KNN predictions against the test labels (data_split[3]).
knn_accuracy = calculateAccuracy(knn_results, data_split[3])
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// A Gaussian Naive Bayes assumes that all variables are Gaussian-distributed for each class.
// The model is only constructed here; it is trained inside the nb_results cell.
nb = new ML.NaiveBayes.GaussianNB()
Insert cell
// The Naive Bayes and decision tree classifiers need integer class labels,
// so each origin name is mapped to a numeric id.
classes_to_ids = ({'USA': 0, 'Japan': 1, 'Europe': 2})
Insert cell
// Reverse lookup: numeric class id back to the origin name.
ids_to_classes = ({0: 'USA', 1: 'Japan', 2: 'Europe'})
Insert cell
nb_results = {
// exercise for the reader?
nb.train(data_split[0], data_split[2].map(x => classes_to_ids[x]));
return nb.predict(data_split[1]);
}
Insert cell
// Score the Naive Bayes predictions against the test labels converted to ids.
// NOTE(review): the arguments here are (truth, predictions), whereas knn_accuracy
// passes (predictions, truth) — confirm calculateAccuracy is order-independent.
nb_accuracy = calculateAccuracy(data_split[3].map(x => classes_to_ids[x]), nb_results)
Insert cell
// There are a lot of potential options for the decision tree classifier - see https://github.com/mljs/decision-tree-cart
// Constructed with default options; it is trained inside the dt_results cell.
dt = new ML.DecisionTreeClassifier()
Insert cell
dt_results = {
// exercise for reader?
dt.train(data_split[0], data_split[2].map(x => classes_to_ids[x]));
return dt.predict(data_split[1]);
}
Insert cell
// Score the decision tree predictions against the test labels converted to ids.
dt_accuracy = calculateAccuracy(data_split[3].map(x => classes_to_ids[x]), dt_results)
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
regression_data_split = {
var cars_data = cars_filtered.map((row) =>
[[row.Cylinders, row.Displacement, row.Horsepower, row.Weight_in_lbs], [row.Miles_per_Gallon], row.Name]);
// note: for MultivariateLinearRegression the output has to be an array
var cars_shuffled = shuffleArray(cars_data);
const train_count = 0.8*cars_shuffled.length; // we will be using 80% of the data for training
var X_train = cars_shuffled.slice(0, train_count).map(x => x[0]);
var X_test = cars_shuffled.slice(train_count).map(x => x[0]);
var Y_train = cars_shuffled.slice(0, train_count).map(x => x[1]);
var Y_test = cars_shuffled.slice(train_count).map(x => x[1]);
var names_train = cars_shuffled.slice(0, train_count).map(x => x[2]);
var names_test = cars_shuffled.slice(train_count).map(x => x[2]);
return [X_train, X_test, Y_train, Y_test, names_train, names_test];
}
Insert cell
Insert cell
// Multivariate linear regression built from the regression training inputs
// (index 0) and training targets (index 2).
lr = new ML.MultivariateLinearRegression(regression_data_split[0], regression_data_split[2])
Insert cell
// Regression predictions for the held-out test inputs (index 1).
lr_results = lr.predict(regression_data_split[1])
Insert cell
Insert cell
Insert cell
Insert cell
SS = require('simple-statistics')
Insert cell
// the r^2 function in simple-statistics doesn't actually work for our case so we'll write our own.
r_squared = {
var data_mean = SS.mean(regression_data_split[3].map(x=>x[0]));
var tss = 0;
var res = 0;
for (var i = 0; i < lr_results.length; i++) {
tss += Math.pow(regression_data_split[3][i][0] - data_mean, 2);
res += Math.pow(regression_data_split[3][i][0] - lr_results[i][0], 2);
}
return 1 - res/tss;
}
Insert cell
Insert cell
Insert cell
// One row of six numeric columns per car, used as input for the clustering
// and PCA cells.
clustering_data = cars_filtered.map((row) => [
  row.Cylinders,
  row.Displacement,
  row.Horsepower,
  row.Weight_in_lbs,
  row.Acceleration,
  row.Miles_per_Gallon
])
Insert cell
Insert cell
// K-means result for the clustering rows; 3 clusters for now.
km_results = ML.KMeans(clustering_data, 3)
Insert cell
Insert cell
// Principal component analysis fitted on the same rows used for clustering.
pca = new ML.PCA(clustering_data)
Insert cell
// Project the clustering rows with the fitted PCA; the result is read
// column-by-column via getColumn in the pca_data cell.
pca_results = pca.predict(clustering_data)
Insert cell
Insert cell
pca_data = {
var d1 = pca_results.getColumn(0);
var d2 = pca_results.getColumn(1);
var temp_data = [];
for (var i = 0; i < cars_filtered.length; i++) {
temp_data.push({'name': cars_filtered[i].Name,
'd1': d1[i],
'd2': d2[i],
'cluster': km_results.clusters[i]});
}
return temp_data;
}
Insert cell
// Scatter plot of the PCA records: d1 on x, d2 on y, coloured by cluster,
// with the car name shown as the tooltip.
vl.markPoint()
.data(pca_data)
.encode(
vl.x().fieldQ('d1'),
vl.y().fieldQ('d2'),
vl.color().fieldN('cluster'),
vl.tooltip().fieldN('name'),
)
.render()
Insert cell
Insert cell
// Car records extended with their two PCA coordinates (d1, d2).
// Each record is a shallow copy: writing d1/d2 directly onto the rows would
// mutate the objects owned by the cars_filtered cell, silently changing the
// value every other cell sees.
// There's probably a better way to do this with arquero or something like that
joint_data = cars_filtered.map((car, idx) =>
  Object.assign({}, car, { d1: pca_data[idx].d1, d2: pca_data[idx].d2 }))
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Display the explained variance reported by the fitted PCA.
pca.getExplainedVariance()
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Load the @tensorflow-models/mobilenet package (0.2.x) and call its load();
// the loaded model is used by the classify cell.
mobilenet = (await require('@tensorflow-models/mobilenet@0.2')).load()
Insert cell
Insert cell
Insert cell
// URL for the file chosen through image_input; used as the <img> source.
image_data = Files.url(image_input)
Insert cell
// <img> element for the selected file, exposed only once it has finished
// loading. Returning the element right after setting `src` lets dependent
// cells (e.g. mobilenet.classify) run on an image whose pixels are not yet
// available. Observable waits for a cell's promise, so dependents still
// receive the element itself.
image = new Promise((resolve, reject) => {
  const img = new Image();
  img.onload = () => resolve(img);
  img.onerror = () => reject(new Error('Could not load the selected image'));
  // Assign src last so the handlers are in place before loading starts.
  img.src = image_data;
})
Insert cell
Insert cell
// Run the MobileNet model on the selected image and display its output.
mobilenet.classify(image)
Insert cell
Insert cell
Insert cell
toxicity = require('@tensorflow-models/toxicity')
Insert cell
// Load the toxicity model. The 0.9 argument is presumably the confidence
// threshold for a match — confirm against the toxicity package docs.
classifier = await toxicity.load(0.9)
Insert cell
// Sample sentences that the `results` cell passes to classifier.classify.
// These are sentence examples from a wikipedia comments dataset that was used to train similar models (not this model).
// see: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/
lines = ["Stop undoing my edits or die!", // a "toxic" comment
"UNBLOCK ME OR I'LL GET MY LAWYERS ON TO YOU FOR BLOCKING MY CONSTITUTIONAL RIGHT TO FREE SPEECH", // another "toxic" comment
"Hi, good day.", // a nontoxic comment
"Thanks for entering this article.", // another nontoxic comment
"idiot did u ever read anything?", // marked as toxic, but try changing the 'o' in 'idiot' to a '0'.
"Considering the vile and perverse anti-Egyptian racism spewed on this talk page on a regular basis, which is all over many Arab countries, especially the Gulf, it is relevant to examine it on an article about Arabs.", // this is marked as a "toxic" comment in the training set.
"Who the hell r u and why do u care neway it was just a bit of fun. Get a life",
"I hate you.", // it's marked as toxic, but are there contexts where this is not toxic?
"I'm working abroad in Australia, and was surprised to find out the green sSkittles over here seem to be Apple flavour.",
"F F F F F F F" // this is a pathological case.
]
Insert cell
// Classify every sample sentence in `lines` with the toxicity model.
results = await classifier.classify(lines)
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Classify the single sentence from sentence_input (wrapped in an array,
// like the batch passed in the `results` cell).
sentence_results = await classifier.classify([sentence_input])
Insert cell
// Second entry of `probabilities` for the first (only) input sentence, taken
// from the overall 'toxicity' label. The entry is looked up by label rather
// than by a fixed position so it does not depend on the order in which the
// model reports its labels.
sentence_toxicity_chance = sentence_results
  .find(({ label }) => label === 'toxicity')
  .results[0].probabilities[1]
Insert cell
Insert cell
Insert cell
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more