Arquero Cookbook / Javier Cardenas

Javier Cardenas

Workspace

Public

Edited

Dec 12, 2023

Fork of Arquero Cookbook

aq.table({

country: ['USA', 'USA', 'Canada', 'Canada'],

medal: ['gold', 'silver', 'gold', 'silver'],

count: [10, 20, 7, 26]

})

.groupby('country') // ensures country is retained as a column

.pivot('medal', 'count') // medal provides keys (column names), count provides values

.view()

aq.table({

country: ['USA', 'USA', 'Canada', 'Canada'],

medal: ['gold', 'silver', 'gold', 'silver'],

count: [10, 20, 7, 26]

})

// note there is no groupby, we will collapse across countries!

.pivot('medal', { total: d => op.sum(d.count) }) // sum counts within a cell

.view()

aq.table({

country: ['USA', 'Canada'],

gold: [10, 7],

silver: [20, 26]

})

.fold(aq.not('country'), { as: ['medal', 'count'] })

.view()

aq.table({ w: ["if I've said it once", "I've said it a thousand times"] })

.derive({ w: d => op.split(d.w, ' ') }) // split on spaces into arrays, replace previous column

.unroll('w') // unroll arrays, with one row per value

.groupby('w') // group by words

.count() // count word occurrences

.view()

{

// import d3-time-format

const d3 = await require('d3-time-format');

return aq.fromCSV(

'date\n1/1/2000\n06/01/2010\n12/10/2020',

{ parse: { date: d3.timeParse('%m/%d/%Y') } }

)

.view()

}

{

// import d3-time-format

const d3 = await require('d3-time-format');

// create and register a date parser under the name 'parse_mdy'

// use { override: true } to suppress errors if we re-evaluate this cell

aq.addFunction('parse_mdy', d3.timeParse('%m/%d/%Y'), { override: true });

return aq

.table({ date: ['1/1/2000', '06/01/2010', '12/10/2020'] })

.derive({ date: d => op.parse_mdy(d.date) })

.view()

}

{

// import d3-time-format

const d3 = await require('d3-time-format');

const parseMDY = d3.timeParse('%m/%d/%Y');

// escape our derive expression to call the function as-is, including closures

return aq

.table({ date: ['1/1/2000', '06/01/2010', '12/10/2020'] })

.derive({ date: aq.escape(d => parseMDY(d.date)) })

.view()

}

function normalize_column(name) {

return name.toLowerCase() // map to lower case

.replace(/[%#$£()\'\"]/g, '') // remove unwanted characters

.replace(/[ /,+.*:\-\r\n@]/g, '_') // replace spacing and punctuation with an underscore

.replace(/_+/g, '_') // collapse repeated underscores

.normalize('NFD') // perform unicode normalization

.replace(/[\u0300-\u036f]/g, ''); // strip accents from characters

}

function normalize(table) {

const name = table.columnNames();

return aq.table({ name, norm: name.map(normalize_column) }) // create table of names & normalized names

.groupby('norm') // group by normalized name

.derive({ index: op.row_number(), count: op.count() }) // count duplicates, generate index for each

.objects() // generate an array of { name, norm } objects

.map(o => ({ [o.name]: o.norm + (o.count > 1 ? `_${o.index}` : '') })); // rename, adding index as needed

}

aq.table({ 'FOO%+BAr': [1], 'FOO++BAr': [2], 'COL.1': [3], 'käse': [4] })

.select(normalize)

.columnNames()

aq.table({

region: ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],

quarter: ['Q1', 'Q2', 'Q3', 'Q4', 'Q1', 'Q2', 'Q3', 'Q4'],

sales: [6, 5, 3, 2, 4, 8, 2, 6]

})

.groupby('quarter') // we want to compute percentage of sales for that quarter

.derive({ percent: d => d.sales / op.sum(d.sales) || 0 }) // include `|| 0` in case sum is zero

.orderby('quarter') // sort by quarter to ease comparison

.view()

aq.table({ x: [1, 1, 2, 2, 3, 3, 4, 4, 5, 5] })

.orderby('x')

.derive({ ra: aq.rolling(d => op.sum(d.x)) })

.view()

aq.table({ x: [1.1, 2.2, 3.2, 4.3, 6.5, 8.6, 9.7, 10.1] })

.orderby('x')

.derive({ ra: aq.rolling(d => op.average(d.x), [-2, 0]) })

.view()

aq.bin('x', { maxbins: 10 })

aq.table({ x: [1, 2, 3, 4, 4, 5, 5, 5, 6, 6, 8, 9, 10] })

.groupby({

bin_0: aq.bin('x', {maxbins: 10}),

bin_1: aq.bin('x', {maxbins: 10, offset: 1})

})

.count()

.view()

// Note: T\the aq.bin and op.bins calls should have the same parameters!

aq.table({ x: [1.1, 3.2, 4.1, 4.2, 5.1, 5.3, 5.4, 6.1, 6.6, 8.8, 9.3, 10] })

.groupby({ bin: aq.bin('x') })

.count()

.impute(

{ count: () => 0 }, // set imputed counts to zero

{ expand: { bin: d => op.sequence(...op.bins(d.bin)) } } // include rows for all bin values

// Note: op.bins(d.bin) -> [start, stop, step]

// so op.sequence(...op.bins(d.bin)) -> op.sequence(start, stop, step)

)

.orderby('bin')

.view();

aq.table({ x: [1, 2, 3, 4, 6, 8, 9, 10] })

.rollup({

average: d => op.average(d.x),

mean: d => op.mean(d.x), // mean and average are synonyms

geom_mean: d => op.exp(op.mean(op.log(d.x))),

harm_mean: d => 1 / op.mean(1 / d.x)

})

.view()

aq.table({ x: [1, 2, 3, NaN, 4, 6, 8, 9, 10] })

.rollup({

samples: d => op.valid(d.x), // invalid values are excluded!

stdev: d => op.stdev(d.x),

stderr: d => op.stdev(d.x) / op.sqrt(op.valid(d.x))

})

.view()

aq.table({ x: [1, 2, 3, 4, 6, 8, 9, 10] })

.rollup({

lo: d => op.mean(d.x) - 1.96 * op.stdev(d.x) / op.sqrt(op.valid(d.x)),

hi: d => op.mean(d.x) + 1.96 * op.stdev(d.x) / op.sqrt(op.valid(d.x))

})

.view()

// compute 1000 bootstrapped mean estimates

// then find quantile boundaries for 95% CI

aq.table({ x: [1, 2, 3, 4, 6, 8, 9, 10] })

.sample(aq.frac(1000), { replace: true })

.derive({ id: d => op.row_number() % 1000 })

.groupby('id')

.rollup({ mu: op.mean('x') })

.rollup({

lo: op.quantile('mu', 0.025), // exclude the bottom 2.5%

hi: op.quantile('mu', 0.975) // exclude the top 2.5%

})

.view()

aq.table({ x: [1, 2, 3, 4, 6, 8, 9, 10] })

.derive({ z: ({ x }) => (x - op.mean(x)) / op.stdev(x) || 0 })

.view()

aq.table({ x: [1, 2, 3, 4, 6, 8, 9, 10] })

.derive({ dev: d => d.x - op.median(d.x) })

.rollup({ mad: d => op.median(op.abs(d.dev)) })

.view()

aq.table({ x: [1, 2, 3, 4, 6, 8, 9, 10] })

.derive({ dev: d => d.x - op.median(d.x) })

.derive({ dev: d => d.dev / op.median(op.abs(d.dev)) || 0 })

.view()

aq.table({

a: [1, 1, 2, 2, 3, 4, 5],

b: [1, 2, 2, 1, 3, 4, 10]

})

.derive({

za: ({ a: x }) => (x - op.mean(x)) / op.stdev(x) || 0,

zb: ({ b: x }) => (x - op.mean(x)) / op.stdev(x) || 0,

})

.derive({

dm: d => {

const s = op.covariance(d.za, d.zb) || 0; // internally calculated once and reused

const di = 1 / (1 - s * s); // inverse determinant of covariance matrix

return op.sqrt(di * (d.za * d.za - 2 * s * d.za * d.zb + d.zb * d.zb));

}

})

.select(aq.not('za', 'zb'))

.view(10)

aq.table({ x: [1, 1, 1, 2, 2, 3] })

.groupby('x').count({ as: 'n' }) // get counts

.derive({ p: d => d.n / op.sum(d.n) }) // from counts to probabilities

.rollup({ H: d => -op.sum(d.p * op.log2(d.p)) }) // entropy

.view()

// first compute histograms for (a, b), a, and b

// then compute mutual information I(a, b) and entropy H(a, b)

// using both we can also compute a normalized distance

aq.table({

a: [1, 1, 2, 2, 3, 4, 5],

b: [9, 8, 8, 8, 7, 6, 5]

})

.groupby('a', 'b').count({as: 'n'})

.groupby('a').derive({ nx: d => op.sum(d.n) })

.groupby('b').derive({ ny: d => op.sum(d.n) })

.ungroup().derive({ s: d => op.sum(d.n) })

.rollup({

I: d => op.sum((d.n / d.s) * op.log2((d.n / d.nx) * (d.s / d.ny))),

H: d => -op.sum((d.n / d.s) * op.log2(d.n / d.s))

})

.derive({ Distance: d => 1 - d.I / d.H })

.view()

aq.table({

a: [1, 2, 2, 3, 4, 5],

b: [9, 8, 8, 7, 6, 5]

})

.rollup({ corr: d => op.corr(d.a, d.b) })

.view()

aq.table({

a: [1, 2, 2, 3, 4, 5],

b: [9, 8, 8, 7, 6, 5]

})

.orderby('a').derive({ ra: op.avg_rank() })

.orderby('b').derive({ rb: op.avg_rank() })

.rollup({ rho: d => op.corr(d.ra, d.rb) })

.view()

aq.table({

a: [3, 4, 5, 6, 7],

b: [1, 2, 3, 4, 5]

})

.rollup({

cohens_d: ({ a, b }) => {

const va = (op.valid(a) - 1) * op.variance(a);

const vb = (op.valid(b) - 1) * op.variance(b);

const sd = op.sqrt((va + vb) / (op.valid(a) + op.valid(b) - 2));

return sd ? (op.mean(a) - op.mean(b)) / sd : 0;

}

})

.view()

aq.table({ x: [1, 2, 3], y: [4, 5, 6] })

.rollup({ dot: d => op.sum(d.x * d.y) })

.view()

aq.table({ x: [1, 2, 3, 4, 5] })

.rollup({

'L1 norm': d => op.sum(op.abs(d.x)),

'L2 norm': d => op.sqrt(op.sum(d.x * d.x)),

'L3 norm': d => op.cbrt(op.sum(op.pow(op.abs(d.x), 3))),

'L∞ norm': d => op.max(op.abs(d.x))

})

.view()

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.

Learn more