Published
Edited
Sep 4, 2020
Importers
8 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Main chart cell. This is an Observable *generator* cell: it yields the svg
// node first so the browser renders the text, then measures the rendered
// adjectives with getBBox() to set the real x-axis range before drawing the
// axis, gridlines, title and legend.
chart = {
const svg = d3.create("svg")
.attr("viewBox", [0, 0, width, height]);
// full-bleed background rect, styled via the .svgBackground CSS class
svg.append("rect")
.attr("width", "100%")
.attr("height", "100%")
.attr("class", "svgBackground");
// create the nouns (characters) as right-anchored text elements, one per row
const noun = svg.append("g")
.attr("class", "data")
.selectAll("text.noun")
.data(data)
.join("text")
.text(d => d.terms.map(m => m.text).join(" "))
.attr("x", margin.left + center.line)
.attr("y", (d, i) => (i+1) * textSpace + margin.top)
.attr("text-anchor", "end");
// create each adjective as a tspan with its own font styling
// (font-weight encodes how often the adjective is applied to this noun)
const adj = svg.append("g")
.attr("class", "data")
adj.selectAll("text.adj")
.data(data)
.join("text")
.attr("x", margin.left + center.line + center.padding)
.attr("y", (d, i) => (i+1) * textSpace + margin.top)
.selectAll("tspan")
.data(d => d.adj)
.join("tspan")
.text(d => d.reduced + " ")
.attr("font-weight", d => countScale(d.count));
// mouseover tooltip for each adjective, showing the clause where the adjective is used
adj.selectAll("text")
.selectAll("tspan")
.append("title")
.text((d, i) => `COUNT: ${d.count}\n${d.sentence.join("\n")}`);
// margin gutter rectangle: masks adjective text overflowing on the right edge
svg.append("rect")
.attr("class", "svgBackground")
.attr("x", width - margin.right)
.attr("width", margin.right)
.attr("height", "100%");
// yield so the svg is attached to the DOM - getBBox() below needs a rendered node
yield svg.node();
// our axis range depends on the size of the textbox - ie. 12 svg tspan elements representing the adjectives for "princess" take up approx. "X" pixels (giving us our max range). Need to yield first to know how wide the textboxes are.
const textWidth = adj.node().getBBox();
nounScale.range([margin.left + center.line + center.padding, textWidth.x + textWidth.width])
// now draw the axis, gridlines, title and legend against the corrected scale
svg.append("g")
.call(xAxis);
svg.append("g")
.call(grid);
svg.call(xTitle)
svg.call(legend)
}
Insert cell
Insert cell
Insert cell
// grab all the characters - and grab all the adjectives associated with those characters
data = {
let nouns = "";
const words = 15;
const improper = (textSelect == "grimm");
//Included an "if statement" to toggle whether a text should be processed to find characters by ProperNouns (eg. Jungle Book) or not (eg. Grimm).
if(improper){ // grab the top 15 Nouns/characters out of the text
nouns = preProcess.nouns().ifNo(stop_words).sort('freq').unique().slice(0, words).json();
} else {
nouns = preProcess.nouns().ifNo(stop_words).sort('freq').if('#ProperNoun').unique().slice(0, words).json();
}
// loop through the top 20 nouns and grab the direct adjectives
nouns.forEach(m => {
var currentNoun = m.text;
var directAdj = preProcess.match(currentNoun) // grab the direct adjectives of the noun
.nouns()
.adjectives();
m.adj = directAdj.out('frequency') //sort the adjectives by frequency and append them into the metadata as an obj
// now loop through the adjectives we just got and grab their corresponding sentences
m.adj.forEach(d => {
var currentAdj = d.reduced;
var passages = directAdj.if(currentAdj) //match each adj one at a time and grab substrings where they occur
.sentences() // when we return senctences it causes some double ups
.unique() // this is why we call .unique() to remove any duplicates
.match(`.{0,5}? ${currentNoun} .{0,5}?`) //wildcard match the adj: look around the noun
.match(`.{0,5}? ${currentAdj} .{0,5}?`); //both directions, ? makes wildcards optional
d.sentence = passages.out('array') // append them into the json
})
})
return nouns
}
Insert cell
// txt file attachments - data from gutenberg.org
// Fetch all attachments in parallel with Promise.all instead of awaiting
// them one-by-one; Observable implicitly awaits a cell whose value is a
// Promise, so `texts` still resolves to the same array of strings in the
// same order as before.
texts = Promise.all([
  FileAttachment("The Brothers Grimm Fairy Tales.txt").text(),
  FileAttachment("Aesop-s Fables.txt").text(),
  FileAttachment("The Jungle Book.txt").text(),
  FileAttachment("The Wonderful Wizard of Oz.txt").text(),
  FileAttachment("Wind in the Willows.txt").text(),
  // FileAttachment("Alice in Wonderland.txt").text(),
  // FileAttachment("Buster Bear.txt").text(),
  // FileAttachment("Arabian Nights pt. 1.txt").text(),
  // FileAttachment("Around the World in 80 Days.txt").text(),
  // FileAttachment("Peter Rabbit.txt").text(),
  // FileAttachment("Reynard the Fox.txt").text(),
  // FileAttachment("Anne of Green Gables.txt").text()
])
Insert cell
input = {
let d = null;
if(textSelect === "grimm"){d = texts[0]}
else if(textSelect === "aesop"){d = texts[1]}
else if(textSelect ==="jungle"){d = texts[2]}
else if(textSelect === "oz" ){d = texts[3]}
else if(textSelect === "wind" ){d = texts[4]}
//else if(textSelect === "anne" ){d = docs[11]}
//else if(textSelect === "world"){d = docs[6]}
//else if(textSelect === "peter"){d = docs[7]}
//else if(textSelect === "fox" ){d = docs[8]}
//else if(textSelect === "bear" ){d = docs[3]}
//else if(textSelect === "night"){d = docs[4]}
//else if(textSelect === "alice"){d = docs[1]}
return d
}
Insert cell
// Normalize a raw Gutenberg text for nlp parsing.
// Truncates to the first 150k characters (~25k words) to keep compromise's
// parse time reasonable, strips carriage returns, flattens newlines to
// spaces, and removes the underscores Gutenberg uses to mark _italics_.
// The useless `i` (ignore-case) flag was dropped from the regexes - it has
// no effect on \r, \n or _, which have no case.
textParse = function(text){
  return text.slice(0, 150000) // 150k characters, approx 25k words
    .replace(/\r/g, "")
    .replace(/\n/g, " ")
    .replace(/_/g, "")
}
Insert cell
// in total one text takes about 15-20 seconds to parse with nlp
// keep in its own cell so it only has to run once when the input text is loaded and cache it until we actually have to mutate the document
doc = {
let nlpDoc = await nlp(textParse(input))
// .cache() precomputes tagging so the repeated .match() calls later are fast
nlpDoc.cache()
return nlpDoc
}
Insert cell
// Options for nlp.normalize(): strip parentheses, possessives and plurals so
// nouns are reduced to an easily-matched base form. normalize() takes an
// object of boolean flags; see
// https://observablehq.com/@spencermountain/compromise-normalization
normalParams = ({
  parentheses: true,
  possessives: true,
  plurals: true,
})
Insert cell
// Normalized copy of the parsed document (see normalParams); all of the
// matching in the `data` cell runs against this.
preProcess = await doc.normalize(normalParams)
Insert cell
// adapted from stopwords compiled by https://observablehq.com/@amal994/visualizing-unstructured-text
// One large alternation string passed to compromise's .ifNo() so phrases
// containing any of these words are dropped before ranking nouns.
// NOTE(review): the literal appears to wrap mid-word here ("anyho|w") -
// verify the notebook source keeps it on a single line.
stop_words="(a|again|couldn't|further|but|still|thence|across|never|fill|don|are|beforehand|serious|you'll|mustn't|ever|ourselves|you've|of|else|it's|about|elsewhere|others|am|when|empty|thereby|now|ltd|very|nowhere|wasn|didn|whose|last|hadn|under|ll|get|what|de|few|as|ie|below|without|me|four|won|among|less|becomes|everywhere|would|an|least|into|whole|however|go|wouldn|whereas|eleven|nobody|there|aren't|bottom|beyond|twenty|first|not|didn't|before|alone|doesn|at|throughout|while|through|with|via|whatever|nor|ours|both|around|made|against|shouldn't|upon|can|must|shouldn|even|fifty|up|his|they|someone|we|also|be|namely|move|amoungst|many|none|another|whenever|any|those|between|hundred|our|himself|may|much|i|every|inc|he|show|hasnt|interest|ma|haven|sometime|if|thick|fifteen|than|see|therein|three|that'll|after|whoever|seems|until|too|anyone|moreover|do|mostly|hereupon|back|from|these|thing|always|ve|whereby|former|along|amongst|rather|shan|this|should|nevertheless|here|having|once|full|yours|yourselves|often|because|the|mightn't|hadn't|several|couldnt|seemed|con|haven't|whether|two|seeming|does|she's|by|so|toward|perhaps|to|just|wouldn't|for|doing|anything|don't|cannot|six|who|will|more|has|did|or|somewhere|fire|is|bill|thus|please|weren|latterly|sometimes|twelve|whereafter|already|side|only|eg|was|put|mill|hence|such|thereafter|eight|third|had|all|indeed|towards|theirs|detail|wherever|its|aren|yet|next|meanwhile|itself|should've|everything|since|hasn|mustn|seem|whereupon|myself|it|sincere|whence|except|whither|being|anyway|then|wasn't|everyone|noone|although|cant|over|part|mine|him|hasn't|otherwise|above|out|thereupon|us|them|wherein|yourself|been|becoming|find|you|during|five|take|somehow|ain|something|neither|onto|might|their|thru|system|how|needn|within|forty|name|together|whom|same|top|why|have|isn't|front|one|hers|down|formerly|themselves|mightn|found|and|give|where|beside|her|own|amount|could|anywhere|were|weren't|which|became|due|in|isn|hereafter|my|she|anyho
w|your|couldn|un|some|nine|per|latter|either|describe|no|won't|almost|on|re|afterwards|hereby|well|keep|herself|etc|herein|nothing|needn't|enough|each|other|most|doesn't|become|that|sixty|besides|off|though|therefore|done|shan't|co|thin|you'd|call|ten|you're|cry|behind|0|1|2|3|4|5|6|7|8|9)"
Insert cell
Insert cell
// frequency an adjective acts on a given noun (eg. king is called "old" 9 times in the text)
// Threshold scale mapping a count to a CSS font-weight:
// count < 2 -> 100, 2 -> 300, 3 -> 500, 4-5 -> 700, 6+ -> 900
countScale = d3.scaleThreshold()
.domain([2, 3, 4, 6])
.range([100, 300, 500, 700, 900])
Insert cell
//number of unique adjectives per noun - (eg. princess has 12 unique adj)
// Linear scale over [0, max adjective count]. The range set here is only a
// placeholder: the chart cell overwrites it after measuring the rendered text.
nounScale = d3.scaleLinear()
.domain([0, d3.max(data.map(d => d.adj.length))])
.range([margin.left + center.line + center.padding, width - margin.right]) //max range is a placeholder
Insert cell
// Bottom axis drawn with nounScale, positioned just below the last text row.
// The domain line is removed so only tick labels show.
xAxis = g => g
.attr("transform", `translate(0,${margin.top + textSpace * data.length + 10})`)
.call(d3.axisBottom(nounScale).ticks(Math.floor(d3.max(data.map(d => d.adj.length))/4))) //aprox number of ticks
.call(g => g.select(".domain").remove())
Insert cell
// Axis caption centered under the x-axis; invoked via svg.call(xTitle).
xTitle = g => {
  const caption = g.append("text");
  caption
    .attr("font-family", "sans-serif")
    .attr("font-size", 12)
    .attr("text-anchor", "middle")
    .attr("x", width / 2)
    .attr("y", margin.top + textSpace * data.length + 40)
    .text("Approximate number of unique adjectives");
  return caption;
}
Insert cell
// Column headers plus a font-weight key; invoked via svg.call(legend).
legend = g => {
  g.attr("class", "other");
  // left column header, right-aligned against the divider line
  g.append("text")
    .attr("text-anchor", "end")
    .attr("x", margin.left + center.line)
    .attr("y", margin.top)
    .text("Character");
  // one sample tspan per weight bucket; values chosen to land in each of
  // countScale's threshold intervals [2, 3, 4, 6]
  const buckets = [
    {value: 1, text: "1"},
    {value: 2, text: "2"},
    {value: 3, text: "3"},
    {value: 5, text: "4-5"},
    {value: 7, text: "6+"},
  ];
  g.append("text")
    .attr("x", margin.left + center.line + center.padding)
    .attr("y", margin.top)
    .text("List of adjectives, weighted by frequency:")
    .selectAll("tspan")
    .data(buckets)
    .join("tspan")
    .attr("font-family", "Roboto")
    .attr("font-weight", d => countScale(d.value))
    .text(d => " " + d.text);
}
Insert cell
// Dashed vertical gridlines, one per axis tick, spanning all text rows.
grid = g => g
.attr("stroke", "currentColor")
.attr("stroke-opacity", 0.1)
.attr("stroke-dasharray", "10, 5")
.call(g => g.append("g")
.selectAll("line")
.data(nounScale.ticks(Math.floor(d3.max(data.map(d => d.adj.length))/4))) // math for aprox no. of legend ticks
.join("line")
.attr("x1", d => 0.5 + nounScale(d)) // 0.5 offset keeps 1px lines crisp
.attr("x2", d => 0.5 + nounScale(d))
.attr("y1", margin.top)
.attr("y2", margin.top + textSpace * data.length + 10))
Insert cell
Insert cell
// D3 v5 - needed for d3.create() and the selection.join() pattern used above
d3 = require("d3@5")
Insert cell
import {select} from "@jashkenas/inputs"
Insert cell
// Load compromise nlp plus its sentences plugin, then register the custom
// word-tag plugin defined below. The local `nlp` deliberately shadows the
// cell name; the cell resolves to the returned, extended library.
// NOTE(review): 'compromise@latest' is unpinned - consider pinning a version
// so the notebook keeps working across major releases.
nlp = {
const nlp = await require('compromise@latest')
let sent = await require('compromise-sentences')
nlp.extend(plugin)
nlp.extend(sent)
return nlp
}
Insert cell
// plugin for some words that are commonly mis-tagged by compromise nlp. Add more if needed.
plugin = function(Doc, world){
world.addWords({
golden: 'Adjective' // important b/c golden normally gets tagged as a Noun in Grimm (eg. "the golden bird")
})
}
Insert cell
// display the loaded compromise version, for reproducibility
nlp.version
Insert cell
// svg height in px (`width` is supplied by Observable's standard library)
height = 475
Insert cell
// chart margins in px
margin = ({top: 40, right: 20, bottom: 20, left: 100})
Insert cell
// x-layout offsets in px: `line` is where the noun column right-aligns,
// `padding` is the gap before the adjective column starts
center = ({line: 100, padding: 20})
Insert cell
// vertical spacing between text rows, px
textSpace = 25
Insert cell
// Web fonts (Lato, Roboto) and the CSS classes the chart uses:
// .svgBackground, .title, .other (legend/axis text), .data (nouns/adjectives).
// The template content is injected verbatim - keep it byte-identical.
html`
<link href="https://fonts.googleapis.com/css2?family=Lato:wght@400;700&display=swap" rel="stylesheet">
<link href="https://fonts.googleapis.com/css2?family=Roboto:wght@100;300;500;700;900&display=swap" rel="stylesheet">
<style>
.svgBackground {fill: #fffbeb; background-color: #fffbeb;}

.title {
font: 24px "Lato", sans-serif;
fill: #263c54;
font-weight: 700;
}

.other {
font: 20px "Lato", sans-serif;
fill: #263c54;
font-weight: 400;
}

.data {
font: 20px "Roboto", sans-serif;
fill: #692020;
font-weight: 400;
}

</style>`
Insert cell
// OLD ATTEMPT AT WORKING WITH COMPROMISE
// so this is extremely slow but it works... taking up to a min just on a small subset of the data
// data = {
// nlp.extend(plugin)
// // this plugin is just for some things that I noticed get mis-tagged. For example, golden is frequently used as an #Adjective in the brothers grimm story "the golden bird"; however it is tagged as a #Noun in standard nlp
// // this data parsing is probably going to look weird. I needed to keep switching between the human readable .json() objects where I am storing things and the nlp() list data structure where I can use the built in matching functions and text identification/tagging. This is probably not best practices for using the compromise api (it runs slower); it would be better to parse everything using the built in matching/tagging, grouping it all under a parent document then converting to json at the last step.
// const noun = doc.terms().normalize(normalParams).nouns(); // nouns as nlp list
// const nounJson = noun.json(); // nouns as json
// // didn't like nlp's counting method so adapting my own.
// nounJson.forEach(d => d.normal = d.normal.replace(/[.]/g, "")) // nlp doesn't remove periods w normalization
// const counts = d3.rollup(nounJson, v => v.length, d => d.normal) // get the counts of each word w rollup
// //filter the data so that we have each unique noun only once, we can do this as soon as the rollup count is done
// const nounObj = nounJson.filter((value, index, self) => self.map(d => d.normal).indexOf(value.normal) === index)
// nounObj.forEach(d => d.count = counts.get(d.normal)) //set the counts as object properties
// nounObj.sort((a, b) => d3.descending(a.count, b.count)) //sort by count
// nounJson.forEach(d => d.adjective = noun.match(d.normal).nouns().adjectives().out('array'));
// return nounObj
// }
Insert cell
Insert cell
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more