Published
Edited
Sep 2, 2020
2 forks
27 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Text input (the `text` widget imported from @jashkenas/inputs) exposed to other cells as
// `field_regex`. Its own description marks it as under construction.
viewof field_regex = text({"title": "field_regex", description: "Under construction"})
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Return the dictionary values backing one column of a table.
 *
 * @param table - any object exposing `getColumn(name)`
 * @param name - name of the column to look up
 * @returns the result of calling `toArray()` on that column's `dictionary`
 */
function extract_keys(table, name) {
  const { dictionary } = table.getColumn(name)
  return dictionary.toArray()
}
Insert cell
Insert cell
Insert cell
local_data = {
// Not used; faster, but less flexible than the version below.
// The early return below makes this cell always evaluate to an empty array; every statement
// after it is unreachable and is kept only as a reference implementation.
return []
// (Unreachable) Keep only the rows whose INSTNM equals my_school, using an Arrow predicate.
const filtered = table.filter(arrow.predicate.col("INSTNM").eq(my_school))
const start = Date.now()
// (Unreachable) Copy the filtered rows into a plain array, timing how long that takes.
const output = []
for (let row of filtered) {
output.push(row)
}
console.log(`counting took ${Date.now() - start} ms`)
return output
}
Insert cell
Insert cell
Insert cell
// Sum of the "degrees" column for the schools selected by count_by's default filter,
// grouped by year, Discipline, CIPTitle and Bachelors.
local_data3 = count_by(["year", "Discipline", "CIPTitle", "Bachelors"], "degrees")
Insert cell
// Inspection cell: shows the dictionary backing the "CIPTitle" column.
table.getColumn("CIPTitle").dictionary
Insert cell
// Inspection cell: lists the name of every field in the table's schema.
table.schema.fields.map(d => d.name)
Insert cell
md`

## Count by functions

My least favorite thing about the Arrow JavaScript libraries is that they're written in very lightly documented TypeScript, which puts them a bit past my ability to modify easily. They expose only a few analytics functions, so I've written some of my own to do categorical counts without the overhead of millions of UTF-8 decodes.

The function below is called count_by, but it actually does two things: it applies a custom filter and then counts by group.

`
Insert cell
// Dictionary values of the "CIPTitle" column, as returned by extract_keys.
all_titles = extract_keys(table, "CIPTitle")
Insert cell
/**
 * Filter the global `table` and sum a weight column per combination of grouping columns.
 *
 * Rows are kept when the dictionary index stored in `filter_col_name` is one of
 * `filter_vals`. Grouping runs on each column's dictionary indices when the column exposes
 * `indices`, so strings are only looked up once per group at the end.
 *
 * @param groups - names of the columns to group by, outermost first
 * @param weight - name of the column whose values are summed within each group
 * @param filter_col_name - column to filter on; its chunks must expose `indices`
 * @param filter_vals - dictionary indices to keep (defaults to the indices of `my_schools`
 *   within `schools`)
 * @returns whatever `flatten_map` returns for the rolled-up result: one object per group,
 *   keyed by the group names plus `weight`
 */
function count_by(groups, weight, filter_col_name = "INSTNM", filter_vals = my_schools.map(k => schools.indexOf(k))) {
  const start = Date.now()

  /** First, run a custom filter on the data **/
  // Arrow predicates don't allow an 'in' operation, just an 'eq' one, so the dictionary
  // indices are scanned by hand and the positions of matching records are collected.
  const wanted = new Set(filter_vals)
  const filter_col = table.getColumn(filter_col_name)
  const hits = []
  let row_number = 0
  for (const batch of filter_col.chunks) {
    for (const ix of batch.indices) {
      if (wanted.has(ix)) {
        hits.push(row_number)
      }
      row_number++
    }
  }

  /** Then roll it up with d3 rollup.
  Rather than use the table, work across the column indices.
  **/
  const cols = groups.map(group => {
    const col = table.getColumn(group)
    if (col.indices) return col.indices
    return col
  })
  const weights = table.getColumn(weight)

  // Lazily produce one [weight, key1, key2, ...] tuple per matching record.
  function* matching_rows() {
    for (const ix of hits) {
      yield [weights.get(ix), ...cols.map(col => col.get(ix))]
    }
  }

  const rolled = d3.rollup(
    matching_rows(),
    entries => d3.sum(entries, entry => entry[0]), // Add up the weights at the end.
    ...cols.map((col, i) => row => row[i + 1]) // Slot 0 holds the weight, so keys start at 1.
  )

  console.log(`counting off the base data took ${Date.now() - start} ms`)

  // Dictionary lookups that turn the grouped integer indices back into their values.
  const renamings = new Map()
  for (const group of groups) {
    const dict = table.getColumn(group).dictionary
    if (dict) renamings.set(group, dict.toArray())
  }

  return flatten_map(rolled, groups, renamings, weight)
}
Insert cell
/**
 * Unfinished stub: sets up cursor placeholders and walks the requested column names,
 * but produces nothing yet and returns undefined.
 *
 * @param table - currently unused
 * @param columns - array of column names; only its `map` method is called
 */
function table_iter(table, columns) {
  // Cursor placeholders; nothing reads them yet.
  let [chunk_num, row_num, i] = [0, 0, 0]
  // One slot per requested column; the mapper has no body yet, so every slot is undefined.
  let cols = columns.map(() => undefined)
}
Insert cell
// Inspection cell: digs out `.indices.data.values` from the first chunk of the INSTNM column
// (presumably the raw dictionary-index buffer — confirm against the Arrow version in use).
col = table.getColumn("INSTNM").chunks[0].indices.data.values
Insert cell
/**
 * Flatten a nested map, whose key names are known, into an array of plain row objects.
 *
 * Recurses one map level per entry of `keylist`. Anything without an `entries()` method is
 * treated as a leaf and becomes a fresh row `{[last_name]: value}`; each enclosing level then
 * adds its own key to that row on the way back up.
 *
 * @param obj - nested map (any object whose `entries()` yields [key, value] pairs) or a leaf
 * @param keylist - property name to use for each map level, outermost first
 * @param renamings - optional Map of `name => lookup`; when a level's name is present, the
 *   stored key is replaced by `lookup[key]`, so grouping can run on dictionary integers
 *   rather than decoded UTF-8 strings
 * @param last_name - property name for the leaf value
 * @returns array of row objects, one per leaf
 */
function flatten_map(obj, keylist, renamings = new Map(), last_name = "count") {
  // Leaves include null/undefined aggregates, so guard before probing for `entries`.
  if (obj == null || typeof obj.entries !== "function") {
    return [{ [last_name]: obj }]
  }
  const [group_name, ...deeper_keys] = keylist
  // Fetch this level's lookup table once rather than once per row.
  const lookup = renamings.has(group_name) ? renamings.get(group_name) : null
  const rows = []
  for (const [k, v] of obj.entries()) {
    for (const row of flatten_map(v, deeper_keys, renamings, last_name)) {
      row[group_name] = lookup ? lookup[k] : k
      rows.push(row)
    }
  }
  return rows
}
Insert cell
my_schools = {
const f = presets.filter(p => p.label==my_school_type)[0].value
if (f !== null) {
return school_info.filter(f).map(d => d.INSTNM)
} else {
return [my_school]
}

}
Insert cell
md`# Data Loading`
Insert cell
// Download the degrees table as raw bytes for Arrow to parse.
buff = fetch("https://benschmidt.org/degrees.feather").then(response => {
  // fetch only rejects on network failure; without this check an HTTP error page would be
  // passed on as if it were the feather file.
  if (!response.ok) {
    throw new Error(`Could not load degrees.feather: ${response.status} ${response.statusText}`)
  }
  return response.arrayBuffer()
})
Insert cell
// Build the Arrow table from the downloaded buffer; the other cells read it as `table`.
table = arrow.Table.from(buff)
Insert cell
// School metadata loaded from CSV; my_schools filters these rows and reads their INSTNM field.
school_info = d3.csv("https://benschmidt.org/hd2017.csv")
Insert cell
import {autoSelect, select, checkbox, text} from "@jashkenas/inputs"

Insert cell
overall_totals = {
const majors = d3.rollup(table, r => d3.sum(r.map(row => row.degrees)), d => d.get("Discipline"), d => d.get("year"))
return majors
}
Insert cell
d3 = require('d3@v5', 'd3-array')
Insert cell
arrow = require("apache-arrow@0.17")
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more