Published
Edited
Sep 4, 2020
Importers
8 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Main chart cell. This is an Observable *generator* cell: it yields the svg
// node first so the browser renders the text, then measures the rendered
// adjectives with getBBox() to set the real x-axis range before drawing the
// axis, gridlines, title and legend.
chart = {
const svg = d3.create("svg")
.attr("viewBox", [0, 0, width, height]);
// full-bleed background rect, styled via the .svgBackground CSS class
svg.append("rect")
.attr("width", "100%")
.attr("height", "100%")
.attr("class", "svgBackground");
// create the nouns (characters) as right-anchored text elements, one per row
const noun = svg.append("g")
.attr("class", "data")
.selectAll("text.noun")
.data(data)
.join("text")
.text(d => d.terms.map(m => m.text).join(" "))
.attr("x", margin.left + center.line)
.attr("y", (d, i) => (i+1) * textSpace + margin.top)
.attr("text-anchor", "end");
// create each adjective as a tspan with its own font styling
// (font-weight encodes how often the adjective is applied to this noun)
const adj = svg.append("g")
.attr("class", "data")
adj.selectAll("text.adj")
.data(data)
.join("text")
.attr("x", margin.left + center.line + center.padding)
.attr("y", (d, i) => (i+1) * textSpace + margin.top)
.selectAll("tspan")
.data(d => d.adj)
.join("tspan")
.text(d => d.reduced + " ")
.attr("font-weight", d => countScale(d.count));
// mouseover tooltip for each adjective, showing the clause where the adjective is used
adj.selectAll("text")
.selectAll("tspan")
.append("title")
.text((d, i) => `COUNT: ${d.count}\n${d.sentence.join("\n")}`);
// margin gutter rectangle: masks adjective text overflowing on the right edge
svg.append("rect")
.attr("class", "svgBackground")
.attr("x", width - margin.right)
.attr("width", margin.right)
.attr("height", "100%");
// yield so the svg is attached to the DOM - getBBox() below needs a rendered node
yield svg.node();
// our axis range depends on the size of the textbox - ie. 12 svg tspan elements representing the adjectives for "princess" take up approx. "X" pixels (giving us our max range). Need to yield first to know how wide the textboxes are.
const textWidth = adj.node().getBBox();
nounScale.range([margin.left + center.line + center.padding, textWidth.x + textWidth.width])
// now draw the axis, gridlines, title and legend against the corrected scale
svg.append("g")
.call(xAxis);
svg.append("g")
.call(grid);
svg.call(xTitle)
svg.call(legend)
}
Insert cell
Insert cell
Insert cell
// grab all the characters - and grab all the adjectives associated with those characters
data = {
let nouns = "";
const words = 15;
const improper = (textSelect == "grimm");
//Included an "if statement" to toggle whether a text should be processed to find characters by ProperNouns (eg. Jungle Book) or not (eg. Grimm).
if(improper){ // grab the top 15 Nouns/characters out of the text
nouns = preProcess.nouns().ifNo(stop_words).sort('freq').unique().slice(0, words).json();
} else {
nouns = preProcess.nouns().ifNo(stop_words).sort('freq').if('#ProperNoun').unique().slice(0, words).json();
}
// loop through the top 20 nouns and grab the direct adjectives
nouns.forEach(m => {
var currentNoun = m.text;
var directAdj = preProcess.match(currentNoun) // grab the direct adjectives of the noun
.nouns()
.adjectives();
m.adj = directAdj.out('frequency') //sort the adjectives by frequency and append them into the metadata as an obj
// now loop through the adjectives we just got and grab their corresponding sentences
m.adj.forEach(d => {
var currentAdj = d.reduced;
var passages = directAdj.if(currentAdj) //match each adj one at a time and grab substrings where they occur
.sentences() // when we return senctences it causes some double ups
.unique() // this is why we call .unique() to remove any duplicates
.match(`.{0,5}? ${currentNoun} .{0,5}?`) //wildcard match the adj: look around the noun
.match(`.{0,5}? ${currentAdj} .{0,5}?`); //both directions, ? makes wildcards optional
d.sentence = passages.out('array') // append them into the json
})
})
return nouns
}
Insert cell
// txt file attachments - data from gutenberg.org
// Fetch all attachments in parallel with Promise.all instead of awaiting
// them one-by-one; Observable implicitly awaits a cell whose value is a
// Promise, so `texts` still resolves to the same array of strings in the
// same order as before.
texts = Promise.all([
  FileAttachment("The Brothers Grimm Fairy Tales.txt").text(),
  FileAttachment("Aesop-s Fables.txt").text(),
  FileAttachment("The Jungle Book.txt").text(),
  FileAttachment("The Wonderful Wizard of Oz.txt").text(),
  FileAttachment("Wind in the Willows.txt").text(),
  // FileAttachment("Alice in Wonderland.txt").text(),
  // FileAttachment("Buster Bear.txt").text(),
  // FileAttachment("Arabian Nights pt. 1.txt").text(),
  // FileAttachment("Around the World in 80 Days.txt").text(),
  // FileAttachment("Peter Rabbit.txt").text(),
  // FileAttachment("Reynard the Fox.txt").text(),
  // FileAttachment("Anne of Green Gables.txt").text()
])
Insert cell
input = {
let d = null;
if(textSelect === "grimm"){d = texts[0]}
else if(textSelect === "aesop"){d = texts[1]}
else if(textSelect ==="jungle"){d = texts[2]}
else if(textSelect === "oz" ){d = texts[3]}
else if(textSelect === "wind" ){d = texts[4]}
//else if(textSelect === "anne" ){d = docs[11]}
//else if(textSelect === "world"){d = docs[6]}
//else if(textSelect === "peter"){d = docs[7]}
//else if(textSelect === "fox" ){d = docs[8]}
//else if(textSelect === "bear" ){d = docs[3]}
//else if(textSelect === "night"){d = docs[4]}
//else if(textSelect === "alice"){d = docs[1]}
return d
}
Insert cell
// Normalize a raw Gutenberg text for nlp parsing.
// Truncates to the first 150k characters (~25k words) to keep compromise's
// parse time reasonable, strips carriage returns, flattens newlines to
// spaces, and removes the underscores Gutenberg uses to mark _italics_.
// The useless `i` (ignore-case) flag was dropped from the regexes - it has
// no effect on \r, \n or _, which have no case.
textParse = function(text){
  return text.slice(0, 150000) // 150k characters, approx 25k words
    .replace(/\r/g, "")
    .replace(/\n/g, " ")
    .replace(/_/g, "")
}
Insert cell
// in total one text takes about 15-20 seconds to parse with nlp
// keep in its own cell so it only has to run once when the input text is loaded and cache it until we actually have to mutate the document
doc = {
let nlpDoc = await nlp(textParse(input))
// .cache() precomputes tagging so the repeated .match() calls later are fast
nlpDoc.cache()
return nlpDoc
}
Insert cell
// Options for nlp.normalize(): strip parentheses, possessives and plurals so
// nouns are reduced to an easily-matched base form. normalize() takes an
// object of boolean flags; see
// https://observablehq.com/@spencermountain/compromise-normalization
normalParams = ({
  parentheses: true,
  possessives: true,
  plurals: true,
})
Insert cell
// Normalized copy of the parsed document (see normalParams); all of the
// matching in the `data` cell runs against this.
preProcess = await doc.normalize(normalParams)
Insert cell
// adapted from stopwords compiled by https://observablehq.com/@amal994/visualizing-unstructured-text
// One large alternation string passed to compromise's .ifNo() so phrases
// containing any of these words are dropped before ranking nouns.
// NOTE(review): the literal appears to wrap mid-word here ("anyho|w") -
// verify the notebook source keeps it on a single line.
stop_words="(a|again|couldn't|further|but|still|thence|across|never|fill|don|are|beforehand|serious|you'll|mustn't|ever|ourselves|you've|of|else|it's|about|elsewhere|others|am|when|empty|thereby|now|ltd|very|nowhere|wasn|didn|whose|last|hadn|under|ll|get|what|de|few|as|ie|below|without|me|four|won|among|less|becomes|everywhere|would|an|least|into|whole|however|go|wouldn|whereas|eleven|nobody|there|aren't|bottom|beyond|twenty|first|not|didn't|before|alone|doesn|at|throughout|while|through|with|via|whatever|nor|ours|both|around|made|against|shouldn't|upon|can|must|shouldn|even|fifty|up|his|they|someone|we|also|be|namely|move|amoungst|many|none|another|whenever|any|those|between|hundred|our|himself|may|much|i|every|inc|he|show|hasnt|interest|ma|haven|sometime|if|thick|fifteen|than|see|therein|three|that'll|after|whoever|seems|until|too|anyone|moreover|do|mostly|hereupon|back|from|these|thing|always|ve|whereby|former|along|amongst|rather|shan|this|should|nevertheless|here|having|once|full|yours|yourselves|often|because|the|mightn't|hadn't|several|couldnt|seemed|con|haven't|whether|two|seeming|does|she's|by|so|toward|perhaps|to|just|wouldn't|for|doing|anything|don't|cannot|six|who|will|more|has|did|or|somewhere|fire|is|bill|thus|please|weren|latterly|sometimes|twelve|whereafter|already|side|only|eg|was|put|mill|hence|such|thereafter|eight|third|had|all|indeed|towards|theirs|detail|wherever|its|aren|yet|next|meanwhile|itself|should've|everything|since|hasn|mustn|seem|whereupon|myself|it|sincere|whence|except|whither|being|anyway|then|wasn't|everyone|noone|although|cant|over|part|mine|him|hasn't|otherwise|above|out|thereupon|us|them|wherein|yourself|been|becoming|find|you|during|five|take|somehow|ain|something|neither|onto|might|their|thru|system|how|needn|within|forty|name|together|whom|same|top|why|have|isn't|front|one|hers|down|formerly|themselves|mightn|found|and|give|where|beside|her|own|amount|could|anywhere|were|weren't|which|became|due|in|isn|hereafter|my|she|anyho
w|your|couldn|un|some|nine|per|latter|either|describe|no|won't|almost|on|re|afterwards|hereby|well|keep|herself|etc|herein|nothing|needn't|enough|each|other|most|doesn't|become|that|sixty|besides|off|though|therefore|done|shan't|co|thin|you'd|call|ten|you're|cry|behind|0|1|2|3|4|5|6|7|8|9)"
Insert cell
Insert cell
// frequency an adjective acts on a given noun (eg. king is called "old" 9 times in the text)
// Threshold scale mapping a count to a CSS font-weight:
// count < 2 -> 100, 2 -> 300, 3 -> 500, 4-5 -> 700, 6+ -> 900
countScale = d3.scaleThreshold()
.domain([2, 3, 4, 6])
.range([100, 300, 500, 700, 900])
Insert cell
//number of unique adjectives per noun - (eg. princess has 12 unique adj)
// Linear scale over [0, max adjective count]. The range set here is only a
// placeholder: the chart cell overwrites it after measuring the rendered text.
nounScale = d3.scaleLinear()
.domain([0, d3.max(data.map(d => d.adj.length))])
.range([margin.left + center.line + center.padding, width - margin.right]) //max range is a placeholder
Insert cell
// Bottom axis drawn with nounScale, positioned just below the last text row.
// The domain line is removed so only tick labels show.
xAxis = g => g
.attr("transform", `translate(0,${margin.top + textSpace * data.length + 10})`)
.call(d3.axisBottom(nounScale).ticks(Math.floor(d3.max(data.map(d => d.adj.length))/4))) //aprox number of ticks
.call(g => g.select(".domain").remove())
Insert cell
// Axis caption centered under the x-axis; invoked via svg.call(xTitle).
xTitle = g => {
  const caption = g.append("text");
  caption
    .attr("font-family", "sans-serif")
    .attr("font-size", 12)
    .attr("text-anchor", "middle")
    .attr("x", width / 2)
    .attr("y", margin.top + textSpace * data.length + 40)
    .text("Approximate number of unique adjectives");
  return caption;
}
Insert cell
// Column headers plus a font-weight key; invoked via svg.call(legend).
legend = g => {
  g.attr("class", "other");
  // left column header, right-aligned against the divider line
  g.append("text")
    .attr("text-anchor", "end")
    .attr("x", margin.left + center.line)
    .attr("y", margin.top)
    .text("Character");
  // one sample tspan per weight bucket; values chosen to land in each of
  // countScale's threshold intervals [2, 3, 4, 6]
  const buckets = [
    {value: 1, text: "1"},
    {value: 2, text: "2"},
    {value: 3, text: "3"},
    {value: 5, text: "4-5"},
    {value: 7, text: "6+"},
  ];
  g.append("text")
    .attr("x", margin.left + center.line + center.padding)
    .attr("y", margin.top)
    .text("List of adjectives, weighted by frequency:")
    .selectAll("tspan")
    .data(buckets)
    .join("tspan")
    .attr("font-family", "Roboto")
    .attr("font-weight", d => countScale(d.value))
    .text(d => " " + d.text);
}
Insert cell
// Dashed vertical gridlines, one per axis tick, spanning all text rows.
grid = g => g
.attr("stroke", "currentColor")
.attr("stroke-opacity", 0.1)
.attr("stroke-dasharray", "10, 5")
.call(g => g.append("g")
.selectAll("line")
.data(nounScale.ticks(Math.floor(d3.max(data.map(d => d.adj.length))/4))) // math for aprox no. of legend ticks
.join("line")
.attr("x1", d => 0.5 + nounScale(d)) // 0.5 offset keeps 1px lines crisp
.attr("x2", d => 0.5 + nounScale(d))
.attr("y1", margin.top)
.attr("y2", margin.top + textSpace * data.length + 10))
Insert cell
Insert cell
// D3 v5 - needed for d3.create() and the selection.join() pattern used above
d3 = require("d3@5")
Insert cell
import {select} from "@jashkenas/inputs"
Insert cell
// Load compromise nlp plus its sentences plugin, then register the custom
// word-tag plugin defined below. The local `nlp` deliberately shadows the
// cell name; the cell resolves to the returned, extended library.
// NOTE(review): 'compromise@latest' is unpinned - consider pinning a version
// so the notebook keeps working across major releases.
nlp = {
const nlp = await require('compromise@latest')
let sent = await require('compromise-sentences')
nlp.extend(plugin)
nlp.extend(sent)
return nlp
}
Insert cell
// plugin for some words that are commonly mis-tagged by compromise nlp. Add more if needed.
plugin = function(Doc, world){
world.addWords({
golden: 'Adjective' // important b/c golden normally gets tagged as a Noun in Grimm (eg. "the golden bird")
})
}
Insert cell
// display the loaded compromise version, for reproducibility
nlp.version
Insert cell
// svg height in px (`width` is supplied by Observable's standard library)
height = 475
Insert cell
// chart margins in px
margin = ({top: 40, right: 20, bottom: 20, left: 100})
Insert cell
// x-layout offsets in px: `line` is where the noun column right-aligns,
// `padding` is the gap before the adjective column starts
center = ({line: 100, padding: 20})
Insert cell
// vertical spacing between text rows, px
textSpace = 25
Insert cell
// Web fonts (Lato, Roboto) and the CSS classes the chart uses:
// .svgBackground, .title, .other (legend/axis text), .data (nouns/adjectives).
// The template content is injected verbatim - keep it byte-identical.
html`
<link href="https://fonts.googleapis.com/css2?family=Lato:wght@400;700&display=swap" rel="stylesheet">
<link href="https://fonts.googleapis.com/css2?family=Roboto:wght@100;300;500;700;900&display=swap" rel="stylesheet">
<style>
.svgBackground {fill: #fffbeb; background-color: #fffbeb;}

.title {
font: 24px "Lato", sans-serif;
fill: #263c54;
font-weight: 700;
}

.other {
font: 20px "Lato", sans-serif;
fill: #263c54;
font-weight: 400;
}

.data {
font: 20px "Roboto", sans-serif;
fill: #692020;
font-weight: 400;
}

</style>`
Insert cell
// OLD ATTEMPT AT WORKING WITH COMPROMISE
// so this is extremely slow but it works... taking up to a min just on a small subset of the data
// data = {
// nlp.extend(plugin)
// // this plugin is just for some things that I noticed get mis-tagged. For example, golden is frequently used as an #Adjective in the brothers grimm story "the golden bird"; however it is tagged as a #Noun in standard nlp
// // this data parsing is probably going to look weird. I needed to keep switching between the human readable .json() objects where I am storing things and the nlp() list data structure where I can use the built in matching functions and text identification/tagging. This is probably not best practices for using the compromise api (it runs slower); it would be better to parse everything using the built in matching/tagging, grouping it all under a parent document then converting to json at the last step.
// const noun = doc.terms().normalize(normalParams).nouns(); // nouns as nlp list
// const nounJson = noun.json(); // nouns as json
// // didn't like nlp's counting method so adapting my own.
// nounJson.forEach(d => d.normal = d.normal.replace(/[.]/g, "")) // nlp doesn't remove periods w normalization
// const counts = d3.rollup(nounJson, v => v.length, d => d.normal) // get the counts of each word w rollup
// //filter the data so that we have each unique noun only once, we can do this as soon as the rollup count is done
// const nounObj = nounJson.filter((value, index, self) => self.map(d => d.normal).indexOf(value.normal) === index)
// nounObj.forEach(d => d.count = counts.get(d.normal)) //set the counts as object properties
// nounObj.sort((a, b) => d3.descending(a.count, b.count)) //sort by count
// nounJson.forEach(d => d.adjective = noun.match(d.normal).nouns().adjectives().out('array'));
// return nounObj
// }
Insert cell
Insert cell
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more