Published
Edited
Sep 2, 2020
2 forks
27 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
viewof field_regex = text({"title": "field_regex", description: "Under construction"})
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Return the dictionary entries of one column as an array.
 *
 * @param table Object exposing getColumn(name); the notebook passes its Arrow table.
 * @param name  Name of the column whose dictionary is wanted.
 * @returns The result of calling toArray() on that column's dictionary.
 */
function extract_keys(table, name) {
  const { dictionary } = table.getColumn(name)
  return dictionary.toArray()
}
Insert cell
Insert cell
Insert cell
local_data = {
// Not used; faster, but less flexible than the version below.
return []
const filtered = table.filter(arrow.predicate.col("INSTNM").eq(my_school))
const start = Date.now()
const output = []
for (let row of filtered) {
output.push(row)
}
console.log(`counting took ${Date.now() - start} ms`)
return output
}
Insert cell
Insert cell
Insert cell
// Totals of the "degrees" column grouped by year, Discipline, CIPTitle and Bachelors,
// using count_by's default filter (the schools listed in my_schools).
local_data3 = count_by(["year", "Discipline", "CIPTitle", "Bachelors"], "degrees")
Insert cell
// Inspection cell: shows the dictionary attached to the CIPTitle column.
table.getColumn("CIPTitle").dictionary
Insert cell
// Inspection cell: lists the name of every field in the table's schema.
table.schema.fields.map(d => d.name)
Insert cell
md`

## Count by functions

My least favorite thing about the Arrow JavaScript libraries is that they're written in very lightly documented TypeScript, which puts them a bit past my ability to easily modify. They expose only a few analytics functions, so I've written some of my own to do categorical counts without the overhead of millions of UTF-8 decodes.

The function below is called count_by, but it is actually both a custom filter and a count-by.

`
Insert cell
// Dictionary entries of the CIPTitle column, obtained through extract_keys.
all_titles = extract_keys(table, "CIPTitle")
Insert cell
/**
 * Filter the notebook-level `table` and total one column per group.
 *
 * Rows are kept when the value yielded by the filter column's per-chunk
 * `indices` is contained in `filter_vals`. The kept rows are then grouped with
 * d3.rollup on the `groups` columns and the `weight` column is summed per group.
 * Grouping works on a column's `indices` when it has them (so dictionary
 * integers are compared rather than decoded strings); the integers are turned
 * back into dictionary entries by flatten_map at the very end.
 *
 * @param groups          Array of column names to group by, outermost first.
 * @param weight          Name of the column to sum; also used as the name of
 *                        the total field in every returned row.
 * @param filter_col_name Name of the column to filter on.
 * @param filter_vals     Index values to keep. By default, the position of each
 *                        entry of `my_schools` within `schools`.
 * @returns The flat array of row objects produced by flatten_map.
 */
function count_by(groups, weight, filter_col_name = "INSTNM", filter_vals = my_schools.map(k => schools.indexOf(k))) {
  const start = Date.now()

  /** First, run a custom filter on the data **/
  // Arrow predicates don't allow an 'in' operation, just an 'eq' one.
  const wanted = new Set(filter_vals)
  const filter_col = table.getColumn(filter_col_name)
  // Store locations of the matching records, numbered continuously across chunks.
  const hits = []
  let row_number = 0
  for (const batch of filter_col.chunks) {
    for (const ix of batch.indices) {
      if (wanted.has(ix)) hits.push(row_number)
      row_number++
    }
  }

  /** Then roll it up with d3 rollup.
      Rather than use the table, work across the column indices. **/
  const cols = groups.map(group => {
    const col = table.getColumn(group)
    return col.indices ? col.indices : col
  })
  const value = table.getColumn(weight)

  // Lazily emit one [weight, group_0, group_1, ...] tuple per matching row.
  function* yielder() {
    for (const ix of hits) {
      yield [value.get(ix), ...cols.map(col => col.get(ix))]
    }
  }

  const rolled = d3.rollup(
    yielder(),
    entries => d3.sum(entries, entry => entry[0]), // Add up the weights at the end.
    ...cols.map((col, i) => row => row[i + 1]) // Slot 0 holds the weight, so keys start at 1.
  )

  console.log(`counting off the base data took ${Date.now() - start} ms`)

  // Collect the dictionary entries of every grouped column that has a dictionary.
  const renamings = new Map()
  for (const group of groups) {
    const dict = table.getColumn(group).dictionary
    if (dict) renamings.set(group, dict.toArray())
  }

  return flatten_map(rolled, groups, renamings, weight)
}
Insert cell
// Unfinished stub: it declares chunk/row counters and maps over `columns` with
// an empty callback, but never uses any of the results and returns undefined.
// TODO: implement or remove.
function table_iter(table, columns) {
let chunk_num = 0;
let row_num = 0;
let i = 0;
let cols = columns.map(name => {
})
}
Insert cell
// Exploratory cell: the `data.values` behind the indices of the first INSTNM chunk.
col = table.getColumn("INSTNM").chunks[0].indices.data.values
Insert cell
/**
 * Flatten a nested map, for which we know the key names, into an array of
 * plain row objects. Recursive, to move down the map levels.
 *
 * @param obj       The nested map, or (at the bottom of the recursion) a leaf
 *                  value. Anything without an `entries` member counts as a leaf.
 * @param keylist   Array of field names, one per nesting level, outermost first.
 * @param renamings Optional map of type {key name => lookup}, where the lookup
 *                  translates the keys found at that level (for example
 *                  dictionary integers, so that Arrow can group on integers
 *                  rather than having to decode the UTF-8). Levels without an
 *                  entry keep their keys as they are. Defaults to an empty Map.
 * @param last_name Field name under which each leaf value is stored.
 * @returns One object per leaf, holding the leaf value plus one field per level.
 */
function flatten_map(obj, keylist, renamings = new Map(), last_name = "count") {
  if (!obj.entries) {
    return [{ [last_name]: obj }]
  }
  const [group_name, ...deeper_keys] = keylist
  const translate = renamings.has(group_name)
  const rows = []
  for (const [key, child] of obj.entries()) {
    const label = translate ? renamings.get(group_name)[key] : key
    for (const row of flatten_map(child, deeper_keys, renamings, last_name)) {
      row[group_name] = label
      rows.push(row)
    }
  }
  return rows
}
Insert cell
my_schools = {
const f = presets.filter(p => p.label==my_school_type)[0].value
if (f !== null) {
return school_info.filter(f).map(d => d.INSTNM)
} else {
return [my_school]
}

}
Insert cell
md`# Data Loading`
Insert cell
buff = fetch("https://benschmidt.org/degrees.feather").then(d => d.arrayBuffer())
Insert cell
// Build the Arrow table from the bytes downloaded in `buff` (degrees.feather).
table = arrow.Table.from(buff)
Insert cell
// Load hd2017.csv with d3.csv; my_schools filters these rows and reads their INSTNM field.
school_info = d3.csv("https://benschmidt.org/hd2017.csv")
Insert cell
import {autoSelect, select, checkbox, text} from "@jashkenas/inputs"

Insert cell
overall_totals = {
const majors = d3.rollup(table, r => d3.sum(r.map(row => row.degrees)), d => d.get("Discipline"), d => d.get("year"))
return majors
}
Insert cell
d3 = require('d3@v5', 'd3-array')
Insert cell
arrow = require("apache-arrow@0.17")
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more