Insert cell
class Corpus {
constructor() {
this.volumes = new Map()
}
*add_ids(ids) {
// Register each id as a Volume and yield it as it is added.
for (const id of ids) {
const volume = new Volume(id)
this.volumes.set(id, volume)
yield volume
}
}
}
Insert cell
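// A hedged sketch of the Corpus stub above: draining the add_ids generator
// registers each id as a Volume (defined later in this notebook; Observable's
// reactivity makes cell order irrelevant).
corpus_demo = {
const corpus = new Corpus()
const added = [...corpus.add_ids(["mdp.39076002651854"])]
return added.length == corpus.volumes.size
}
Insert cell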
lib = new Library().fetch_all(["mdp.39076002651854", "pst.000061166424"])
Insert cell
html`<h2>Top tokens</h2>${lib.chunks({"doc": "volume", "what": "word"}).sort((a, b) => b.count - a.count).slice(0, 10).map(d => `<strong>${d.word}</strong> ${d.count} <br/>`).join('')}`
Insert cell
data = fetch(
`https://corsproxy.io/?${encodeURIComponent(
`https://data.analytics.hathitrust.org/features-2020.03/${id_to_stubbytree(htid)}`
)}`
)
.then((response) => response.arrayBuffer())
.then((data) => bz2.decompress(new Uint8Array(data)))
.then((array) => JSON.parse(new TextDecoder().decode(array)))
Insert cell
unzip = require('compressjs@1.0.3/main.js').catch(() => window["define"])
Insert cell
class Volume2 {
constructor(htid) {
this.htid = htid;
this.download();
}

download() {
// Memoize the in-flight promise so repeated calls share one request.
if (this._json_promise) {
return this._json_promise;
}

this._json_promise = fetch(
`https://corsproxy.io/?https://data.analytics.hathitrust.org/features-2020.03/${id_to_stubbytree(
this.htid
)}`
)
.then((response) => response.arrayBuffer())
.then((data) => bz2.decompress(new Uint8Array(data)))
.then((array) => {
const json = JSON.parse(new TextDecoder().decode(array));
this._json = json;
return json;
});
return this._json_promise;
}
}
Insert cell
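// A minimal sketch of Volume2 in use: download() resolves to the parsed
// Extracted Features JSON, so the metadata should be reachable (assuming the
// corsproxy route above still answers).
v2_title = new Volume2(htid).download().then((json) => json.metadata.title)
Insert cell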
bz2 = require('bz2@1.0.1/index.js').catch(() => window["bz2"])
Insert cell
htid = "mdp.39076002651854"
Insert cell
id_to_stubbytree(htid)
Insert cell
g = fetch(
`https://corsproxy.io/?https://data.analytics.hathitrust.org/features-2020.03/${id_to_stubbytree(
htid
)}`
)
Insert cell
id_to_stubbytree = function (htid) {
// HathiTrust id cleaning: every ':' becomes '+' and every '/' becomes '='
// (e.g. "loc.ark:/13960/t2q52tn56" -> "loc.ark+=13960=t2q52tn56").
const htid_clean = htid.replace(/:/g, "+").replace(/\//g, "=")
// Split on the *first* period only: volume ids may themselves contain periods.
const dot = htid.indexOf(".")
const libid = htid.slice(0, dot)
const volid_clean = htid.slice(dot + 1).replace(/:/g, "+").replace(/\//g, "=")
// Stubbytree layout: library / every-third-character stub / full clean id.
const stub = volid_clean.split("").filter((d, i) => i % 3 == 0).join("")
return [libid, stub, htid_clean + ".json.bz2"].join("/")
}
Insert cell
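// A sanity check on the path construction: the stub directory is every third
// character of the cleaned volume id, so (worked out by hand from the rules
// above) the sample volume should land at the path below.
stubbytree_test = id_to_stubbytree("mdp.39076002651854") == "mdp/37055/mdp.39076002651854.json.bz2"
Insert cell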
class Volume {
// A class to describe a HathiTrust volume.
// Most methods are broken until the 'fetch' method has completed loading;
// in code, use `new Volume('yourid').load()`, which returns the completed
// volume inside a promise.
constructor(id) {
this.id = id;
this.chunk_cache = new Map();
this.fetch();
this.options = {
// Defaults; overridden by the options passed to individual methods.
sections: ["body"],
what: "word",
doc: "page",
size: 10000
};
}

promise_from_htrc() {
// An indexedDB-based local cache could short-circuit this fetch;
// it is disabled for now, so `cache` always resolves to undefined.
const cache = Promise.resolve(undefined);
return cache.then((cached_version) => {
if (cached_version !== undefined) {
console.log("Returning " + this.id + " from local cache");
return cached_version;
}
console.log("fetching " + this.id + " remotely");
return d3.json(
`https://corsproxy.io/?https://data.analytics.hathitrust.org/htrc-ef-access/get?action=download-ids&id=${this.id}&output=json`
);
});
}

fetch() {
// Fetch returns a promise of the data, and populates the cache at this.data.
// 'load' is a friendlier wrapper that returns the whole `Volume` object itself wrapped in a promise.

// Once it has resolved, you can access the data element directly, but it will often be necessary
// to wrap the fetch method in some promise resolution to work with it.

if (this.promise) {
return this.promise;
}
this.promise = this.promise_from_htrc().then((d) => {
this.data = d;
return d;
});
return this.promise;
}

async load() {
await this.fetch();
return this;
}

total_word_count(options) {
// This is also in the metadata.
const loc_counts = this.word_counts(options);
const tot = Array.from(loc_counts.values()).reduce(
(left, right) => +left + right,
0
);
return tot;
}

repr() {
// Return an HTML representation of the book suitable for linking back to Hathi.
const { title, pubDate, names, imprint, volumeIdentifier } =
this.data.metadata;
return `${names.join(
"/"
)}. <em>${title}</em>. ${imprint}. <code><a href=https://babel.hathitrust.org/cgi/pt?id=${
this.id
}>${this.id}</a></code>`;
}

count(options) {
if (options.what === undefined) {
options.what = "word";
}

if (options.doc === undefined) {
options.doc = "volume";
}

const { doc } = options;

if (doc == "chunk") {
return this.chunk_counts(options);
} else if (doc == "page") {
return this.page_counts(options);
} else if (doc == "ntile") {
const total = this.total_word_count(options);
const ntile_size = Math.floor(1 + (total + 1) / options.ntiles);
// It's a little sketchy to mutate this array in place;
// it may confuse people toggling back and forth.
const counts = this.chunk_counts(options, ntile_size);
return counts;
} else if (doc == "volume") {
return [this.word_counts(options)];
}

return "Choose an allowed count type";
}

chunk_counts(options, size) {
// Divide the volume into continuous chunks of roughly `size` words each.

// Results are memoized in chunk_cache, so heavy use with many different
// sizes could build up something like a memory leak.

if (size == undefined && options.size) {
size = options.size;
}
const { chunk_cache } = this;
const { what } = options;
if (chunk_cache.get(`${what}-${size}`)) {
return chunk_cache.get(`${what}-${size}`);
}

const pages = this.page_counts(options);

// Total tokens on each page, then a running total across pages.
const lengths = pages.map((d) => d3.sum(Array.from(d.values())));
const array_reducer = (a, x, i) => [...a, a.length > 0 ? x + a[i - 1] : x];
const cumulative_lengths = lengths.reduce(array_reducer, []);

// Assign each page to the chunk its running total falls into.
const chunk_lab = cumulative_lengths.map((n) => Math.floor(n / size));
const max_chunk = d3.max(chunk_lab);
const counts = d3.range(max_chunk + 1).map((d) => new Map());
pages.forEach((p, i) => {
merge_counts(counts[chunk_lab[i]], p);
});

chunk_cache.set(`${what}-${size}`, counts);
return counts;
}

word_counts(options) {
const pages = this.page_counts(options);
return pages.reduce(merge_counts, new Map());
}

page_counts(options) {
const { what } = options;
// Default to counting only the body section.
options.sections = options.sections || ["body"];
if (!this.data) {
throw new Error(
"Data must be (asynchronously) loaded; wrap your code in Volume.fetch().then(data => { ... })"
);
}
return this.data.features.pages.map((d) => new Page(d).counts(options));
}
}
Insert cell
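// A sketch of single-volume chunking: count() with doc "ntile" should return
// roughly `ntiles` Maps of word counts (v1 is defined in the tests below;
// Observable's reactivity makes the order irrelevant).
ntile_example = v1.load().then((v) => v.count({ doc: "ntile", ntiles: 20, what: "word" }).length)
Insert cell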
class Page {
// A single page in an EF volume.
constructor(data, volume) {
this.data = data
this.volume = volume
}
counts(options) {
const { what, sections, regex } = options
const page = this.data
const counts = new Map()
sections.forEach(section => {
if (page[section] === undefined) { return }
Object.entries(page[section].tokenPosCount).forEach((o) => {
const [word, v] = o
Object.entries(v).forEach((o2) => {
const [pos, count] = o2;
if (regex && !word.match(regex)) {
return
}
if (options.pos && options.pos != pos) {
return
}
let key = word
if (what == 'pos') {
key = pos
} else if (what == 'word&pos') {
key = `${word}%&%${pos}`
} else if (what == 'lowercase') {
key = word.toLowerCase()
}
counts.set(key, (counts.get(key)|| 0) + parseInt(count))
})
})
})
return counts
}
}
Insert cell
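// Page can also be used standalone on raw Extracted Features page data:
// here, word counts from the body of the first page of the first loaded volume.
first_page_counts = v1.load().then((v) => new Page(v.data.features.pages[0]).counts({ what: "word", sections: ["body"] }))
Insert cell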
lib.top_words(100, {what: 'lowercase'})
Insert cell

class Library {
constructor(ids = []) {
this.m = new Map()
for (const k of ids) {
this.m.set(k, new Volume(k))
}
}
/* OPERATORS: apply to chunks and update them */
word_counts(options) {
const accessor = (options.what == "word" || options.what == 'lowercase') ? d => d.word : d => '' + d.word + d.pos
const total_counts = d3.rollup(this.chunks(options), vals => d3.sum(vals.map(v => v.count)), accessor)
return total_counts
}
top_words(n, options = {'what': 'word'}) {
const counts = this.word_counts(options)
return ([...counts.entries()]).sort((a, b) => b[1] - a[1]).map(d => d[0]).slice(0, n)
}
haplax(chunks, options) {
// Tag each chunk by whether it holds the word's only occurrence in the corpus
// (i.e., whether the word is a corpus-wide hapax legomenon).
const accessor = options.what == "word" ? d => d.word : d => '' + d.word + d.pos
const total_counts = d3.rollup(chunks, vals => d3.sum(vals.map(v => v.count)), accessor)
chunks.forEach(chunk => {
// Is the count here the total count? If so, this is a corpus haplax.
chunk.haplax = total_counts.get(accessor(chunk)) == chunk.count
})
return chunks
}
dunning(chunks, options) {
// Does not check for non-occurring words in each chunk.
const documents = d3.group(chunks, d => d.index)
const wordCounts = d3.rollup(chunks, d => d3.sum(d.map(e => e.count)), d => d.word)
const total_words = d3.sum(wordCounts.values())
for (const [doc_index, elements] of documents) {
const c = d3.sum(elements.map(d => d.count))
const d = total_words - c
elements.forEach(element => {
// Formula from http://wordhoard.northwestern.edu/userman/analysis-comparewords.html
// How many times is this word in this chunk?
const { word, count } = element
const a = count
// How many times is the word *out* of this chunk?
const b = d3.max([1, wordCounts.get(word) - a])
const E1 = c*(a+b)/total_words
const E2 = d*(a+b)/total_words

// Bind the calculations to the object in-place
// PMI including itself in the corpus.
element.PMI = Math.log(a/E1)
// PMI compared to external corpus.
element.PMI_2 = Math.log((a/c)/(d3.max([1, b])/d))
element.dunning = 2*((a*Math.log(a/E1)) + (b*Math.log(b/E2)))
})
}
return chunks
}
tfidf(chunks, options) {
const documents = d3.group(chunks, d => d.index)
const n_documents = Array.from(documents).length;
// idf: log of (number of documents / number of documents containing the word).
const idf = d3.rollup(chunks, worddata => Math.log(n_documents/worddata.length), d => d.word)
for (const elements of documents.values()) {
const doc_wordcount = d3.sum(elements.map(d => d.count))
elements.forEach(element => {
const { count, word } = element;
const tf = count/doc_wordcount
element.tf = tf
element.tfidf = tf * idf.get(word)
})
}
return chunks
}
PMI(chunks, options) {
// PMI and dunning are assigned at the same time.
return this.dunning(chunks, options)
}
/* Basic strategies: create chunks. This is the primary method. */
chunks(options, additional_metrics = []) {
const output = []
let total_index = 0
for (const [htid, volume] of this.m.entries()) {
let chunk_num = 0;
for (const chunk of volume.count(options)) {
total_index += 1;
chunk_num += 1;
for (const [key, count] of chunk.entries()) {
let word = undefined
let pos = undefined
if (options.what == "word&pos") {
[word, pos] = key.split("%&%")
} else if (options.what == "word" || options.what == 'lowercase') {
word = key
} else if (options.what == "pos") {
pos = key
}
output.push({
'word': word,
'pos': pos,
'count': count,
'vol_index': chunk_num,
'index': total_index,
'htid': htid
})
}
}
}
additional_metrics.forEach(metric => this[metric](output, options))
return output
}
get(id) {
const { m } = this;
if (m.get(id) !== undefined) {
return m.get(id)
} else {
const v = new Volume(id)
m.set(id, v)
return v
}
}
fetch_all(ids = []) {
// Asynchronously ensure that all books are loaded, then return
// the library itself.
this.update(ids)
const volumes = Array
.from(this.m.values())
.map(book => book.fetch())
return Promise.all(volumes)
.then(() => this)
}
update(ids) {
// Ensure every id is registered as a Volume in the internal map.
ids.forEach(id => this.get(id))
return this
}
repr() {
const books = Array.from(this.m.values())
return Promise.all(books.map(book => book.fetch()))
.then( (data) => {
const reprs = books.map(book => book.repr())
let output = "<h3>Books plotted</h3><ol>"
reprs.forEach(e => {
const element = `<li>${e}</li>`
output += `\n${element}\n`
})
output += "</ol>"
return output
})
}
}
Insert cell
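// The chunks-plus-metrics pipeline end to end: the ten words with the highest
// Dunning log-likelihood when each volume is set against the rest of the corpus.
distinctive_words = lib.chunks({ doc: "volume", what: "word" }, ["dunning"]).sort((a, b) => b.dunning - a.dunning).slice(0, 10)
Insert cell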
function merge_counts(a, b) {
// Merge two count Maps into one.
// Modifies the left-hand Map in place; beware of side effects.
for (const [k, v] of b.entries()) {
a.set(k, (a.get(k) || 0) + v)
}
return a
}
Insert cell
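// merge_counts in isolation: the left-hand Map is mutated and returned.
merge_test = {
const a = new Map([["whale", 2]])
const b = new Map([["whale", 1], ["ship", 3]])
return merge_counts(a, b).get("whale") == 3 && a.get("ship") == 3
}
Insert cell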
md`# Tests

Here are some unit tests.

`
Insert cell
lib.chunks({'doc': 'ntile', 'ntiles': 100, what: 'word'}).length > lib.chunks({'doc': 'ntile', 'ntiles': 2, what: 'word'}).length
Insert cell
lib.chunks({'doc': 'chunk', 'size': 100000, 'what': 'word'})
Insert cell
// Are there 100 percentiles?
percentile_test = d3.max(lib.chunks({'doc': 'ntile', 'ntiles': 100, what: 'word'}).map(d => d.vol_index)) == 100
Insert cell
// Does it run Dunning scores successfully when asked?
dunning_exists = lib.chunks({'doc': 'ntile', 'ntiles': 20}, ["dunning", "tfidf"])[0].dunning != undefined
Insert cell
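// tf-idf over 10,000-word chunks: which terms most distinguish each chunk?
tfidf_example = {
const chunks = lib.chunks({ doc: "chunk", size: 10000, what: "word" }, ["tfidf"])
return chunks.sort((a, b) => b.tfidf - a.tfidf).slice(0, 5)
}
Insert cell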
v1 = Array.from(lib.m.values())[0]
Insert cell
word_counts_over_100 = v1.total_word_count({'what': "word", 'doc': "page"}) > 100
Insert cell
md`# Outside imports`
Insert cell
cola = require("webcola@3/WebCola/cola.min.js")
Insert cell
d3 = require('d3', 'd3-fetch', 'd3-array')
Insert cell
