Public
Edited
Jul 28, 2023
Insert cell
md`# Understanding Word Vectors

//mm edits by M.M.

*A js/observable port of Allison Parrish's excellent [Jupyter notebook](https://gist.github.com/aparrish/2f562e3737544cf29aaf1af30362f469) of the same name.*

## Animal similarity and linear algebra

We'll begin by considering a small subset of English: words for animals. Our task is to be able to write programs to find similarities among these words and the creatures they designate. To do this, we might start by making a spreadsheet of some animals and their characteristics. For example:

`
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Euclidean distance between two points given as [x, y] pairs.
distance2d = function(p1, p2) {
  // Squared offset along each of the two axes (index 0, then index 1).
  const squaredOffsets = [0, 1].map((axis) => {
    const offset = p2[axis] - p1[axis];
    return offset * offset;
  });
  return Math.sqrt(squaredOffsets[0] + squaredOffsets[1]);
}
Insert cell
Insert cell
distance2d(animal('capybara'), animal('panda bear')) // Euclidean distance between capybara and panda bear in [cuteness, size] space
Insert cell
Insert cell
distance2d(animal('tarantula'), animal('elephant')) // Euclidean distance between tarantula and elephant in [cuteness, size] space
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Value stored in `colors` under the name "olive".
colors.olive
Insert cell
// Value stored in `colors` under the name "red".
colors.red
Insert cell
// Value stored in `colors` under the name "black".
colors.black
Insert cell
Insert cell
Insert cell
// Example: call `distance` on the two literal vectors [10, 1] and [5, 2].
distance([10, 1], [5, 2]);
Insert cell
Insert cell
Insert cell
// Example: call `subtractv` on [10, 1] and [5, 2].
// NOTE(review): presumably element-wise [10, 1] - [5, 2]; confirm against the subtractv cell.
subtractv([10, 1], [5, 2])
Insert cell
Insert cell
Insert cell
// Example: call `addv` on the two literal vectors [10, 1] and [5, 2].
addv([10, 1], [5, 2])
Insert cell
Insert cell
Insert cell
// Example: call `meanv` on a list of three 2-element vectors.
meanv([[0, 1], [2, 2], [4, 3]])
Insert cell
Insert cell
// Is the red-to-green distance greater than the red-to-pink distance?
distance(colors.red, colors.green) > distance(colors.red, colors.pink)
Insert cell
Insert cell
Insert cell
// Query `closest` over `colors` with the value stored under colors.red.
closest(colors, colors.red)
Insert cell
Insert cell
// Query `closest` over `colors` with the literal vector [150, 60, 150].
closest(colors, [150, 60, 150])
Insert cell
Insert cell
// Query `closest` with the result of subtractv(purple, red).
closest(colors, subtractv(colors.purple, colors.red))
Insert cell
Insert cell
// Query `closest` with the result of addv(blue, green).
closest(colors, addv(colors.blue, colors.green))
Insert cell
Insert cell
// Mean of black and white; the expected answer is a medium grey.
closest(colors, meanv([colors.black, colors.white]))
Insert cell
Insert cell
{
// an analogy: pink is to red as X is to blue
let pinkToRed = subtractv(colors['pink'], colors['red'])
return closest(colors, addv(pinkToRed, colors['blue']))
}
Insert cell
Insert cell
{
// another example:
let navyToBlue = subtractv(colors['navy'], colors['blue'])
return closest(colors, addv(navyToBlue, colors['green']))
}
Insert cell
Insert cell
Insert cell
Insert cell
avgColor = {
let doc = await gutenFetch('http://www.gutenberg.org/cache/epub/345/pg345.txt');
let words = RiTa.tokenize(doc), dracColors = [];
words.forEach((w,i) => colors[w.toLowerCase()]
&& dracColors.push(colors[w.toLowerCase()]));
return meanv(dracColors);
}
Insert cell
// Query `closest` over `colors` with avgColor (the meanv of the color-word vectors collected from the text).
closest(colors, avgColor)
Insert cell
Insert cell
Insert cell
Insert cell
// Append " trousers" to each entry that `closest` returns for colors.mauve.
closest(colors, colors.mauve).map((colorName) => colorName + " trousers");
Insert cell
Insert cell
Insert cell
Insert cell
// Fetch Project Gutenberg text #345 and pass it to RiTa.tokens; the result is
// the `tokenList` argument given to gloVeClosest in the cells that follow.
tokens = RiTa.tokens(await gutenFetch('http://www.gutenberg.org/cache/epub/345/pg345.txt'));

Insert cell
Insert cell
// Look up the vector that `gloVe` holds for the word "cheese".
gloVe.getVector("cheese")
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Cosine distance between two numeric vectors, computed with TensorFlow.js.
 *
 * @param {number[]} v1 - first vector (passed to tf.tensor1d)
 * @param {number[]} v2 - second vector (passed to tf.tensor1d)
 * @returns {number} 1 minus (dot(v1, v2) / (‖v1‖₂ · ‖v2‖₂))
 */
cosine = function(v1, v2) {
  // tf.tidy disposes every tensor created inside the callback. Without it each
  // call leaks both inputs plus all intermediates, which adds up quickly when
  // this is invoked once per sentence.
  return tf.tidy(() => {
    const a = tf.tensor1d(v1);
    const b = tf.tensor1d(v2);
    const dot = a.dot(b).asScalar();
    const similarity = dot.div(a.norm(2)).div(b.norm(2));
    // dataSync() copies the scalar out synchronously, so a plain number leaves the tidy scope.
    return 1 - similarity.dataSync()[0];
  });
}
Insert cell
// cosine() returns 1 - similarity, so a smaller value means the two vectors are more alike:
// check that dog/puppy scores lower than trousers/octopus.
cosine(vec('dog'), vec('puppy')) < cosine(vec('trousers'), vec('octopus'))
Insert cell
Insert cell
/**
 * Nearest GloVe neighbours of `vector`, restricted to words present in `tokenList`.
 *
 * @param {string[]} tokenList - words that may appear in the result
 * @param {number[]} vector - query vector (passed to tf.tensor1d)
 * @param {number} [n=10] - maximum number of words to return
 * @returns {Promise<string[]>} up to `n` words, in the order the neighbours were returned
 */
gloVeClosest = async function(tokenList, vector, n=10) {
  // Build the membership set once instead of scanning tokenList for every neighbour.
  const allowedWords = new Set(tokenList);
  const query = tf.tensor1d(vector);
  try {
    // Ask for n*100 candidates so that enough remain after filtering by tokenList.
    const neighbors = await gloVe._getNearestNeighbors(query, n*100, true);
    return neighbors
      .filter((neighbor) => allowedWords.has(neighbor.word))
      .slice(0, n)
      .map((neighbor) => neighbor.word);
  } finally {
    // Release the query tensor; dispose() on an already-disposed tensor is a no-op.
    query.dispose();
  }
}
Insert cell
Insert cell
// Words from `tokens` among the nearest neighbours of vec("basketball").
gloVeClosest(tokens, vec("basketball"))
Insert cell
Insert cell
// Words from `tokens` among the nearest neighbours of the meanv of vec("day") and vec("night").
gloVeClosest(tokens, meanv([vec("day"), vec("night")]))
Insert cell
Insert cell
// Words from `tokens` among the nearest neighbours of vec("wine").
gloVeClosest(tokens, vec("wine"))
Insert cell
Insert cell
// Words from `tokens` among the nearest neighbours of subtractv(vec("wine"), vec("alcohol")).
gloVeClosest(tokens, subtractv(vec("wine"), vec("alcohol"))) // hmmm, something off here
Insert cell
Insert cell
// Words from `tokens` among the nearest neighbours of vec("water").
gloVeClosest(tokens, vec("water"))
Insert cell
Insert cell
// Words from `tokens` among the nearest neighbours of addv(vec("water"), vec("frozen")).
gloVeClosest(tokens, addv(vec("water"),vec("frozen")))
Insert cell
Insert cell
// Words from `tokens` among the nearest neighbours of vec("grass").
gloVeClosest(tokens, vec("grass"))
Insert cell
Insert cell
{
// analogy: blue is to sky as X is to grass
let blueToSky = subtractv(vec("blue"), vec("sky"))
return gloVeClosest(tokens, addv(blueToSky, vec("grass")))
}
Insert cell
Insert cell
// Sentence vector: tokenize `s` with RiTa, look up each token's GloVe vector,
// and return the meanv of those vectors.
sentVec = function(s) {
  const toVector = (token) => gloVe.getVector(token);
  const tokenVectors = RiTa.tokenize(s).map(toVector);
  return meanv(tokenVectors);
}
Insert cell
// Fetch Project Gutenberg text #345 and split it with RiTa.sentences.
sentences = RiTa.sentences(await gutenFetch('http://www.gutenberg.org/cache/epub/345/pg345.txt'));
Insert cell
/**
 * Sentences from `space` whose sentence vector is closest (by `cosine`) to that of `inputStr`.
 *
 * @param {string[]} space - candidate sentences; only the first `maxCandidates` are scored
 * @param {string} inputStr - query sentence
 * @param {number} [n=10] - maximum number of sentences to return
 * @param {number} [maxCandidates=500] - how many leading entries of `space` to score
 * @returns {string[]} up to `n` sentences, ordered by ascending cosine distance
 */
gloVeClosestSent = function(space, inputStr, n=10, maxCandidates=500) {
  const inputVec = sentVec(inputStr);
  // A Map keeps arbitrary sentence text safe as a key: a plain object drops a
  // "__proto__" sentence and moves integer-like keys ahead of all others.
  const dists = new Map();
  console.time("cos");
  // Score a copy of the leading slice; the caller's array is left untouched.
  space.slice(0, maxCandidates).forEach((sentence) => {
    dists.set(sentence, cosine(sentVec(sentence), inputVec));
  });
  console.timeEnd("cos");
  return [...dists.keys()]
    .sort((a, b) => dists.get(a) - dists.get(b))
    .slice(0, n);
}
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Look up an animal by name in `data`.
 *
 * @param {string} name - value compared (strictly) against each row's `name`
 * @returns {number[]|undefined} [cuteness, size] of the first matching row, or undefined if none matches
 */
animal = function(name) {
  // find() stops at the first match instead of filtering and mapping the whole dataset.
  const match = data.find((row) => row.name === name);
  return match === undefined ? undefined : [match.cuteness, match.size];
}
Insert cell
RiTa = require('rita@2.4.86');
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// NOTE(review): no consumer of `height` is visible here; presumably the pixel height of a chart cell — confirm.
height = 400
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more