Insert cell
class Corpus {
constructor() {
this.volumes = new Map()
}
*add_ids(ids) {
// Register each id as a Volume and yield it as it is added.
for (const id of ids) {
const volume = new Volume(id)
this.volumes.set(id, volume)
yield volume
}
}
}
Insert cell
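// A hedged sketch of the Corpus stub above: draining the add_ids generator
// registers each id as a Volume (defined later in this notebook; Observable's
// reactivity makes cell order irrelevant).
corpus_demo = {
const corpus = new Corpus()
const added = [...corpus.add_ids(["mdp.39076002651854"])]
return added.length == corpus.volumes.size
}
Insert cell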
lib = new Library().fetch_all(["mdp.39076002651854", "pst.000061166424"])
Insert cell
html`<h2>Top tokens</h2>${lib.chunks({"doc": "volume", "what": "word"}).sort((a, b) => b.count - a.count).slice(0, 10).map(d => `<strong>${d.word}</strong> ${d.count} <br/>`).join('')}`
Insert cell
data = fetch(
`https://corsproxy.io/?${encodeURIComponent(
`https://data.analytics.hathitrust.org/features-2020.03/${id_to_stubbytree(htid)}`
)}`
)
.then((response) => response.arrayBuffer())
.then((data) => bz2.decompress(new Uint8Array(data)))
.then((array) => JSON.parse(new TextDecoder().decode(array)))
Insert cell
unzip = require('compressjs@1.0.3/main.js').catch(() => window["define"])
Insert cell
class Volume2 {
constructor(htid) {
this.htid = htid;
this.download();
}

download() {
// Memoize the in-flight promise so repeated calls share one request.
if (this._json_promise) {
return this._json_promise;
}

this._json_promise = fetch(
`https://corsproxy.io/?https://data.analytics.hathitrust.org/features-2020.03/${id_to_stubbytree(
this.htid
)}`
)
.then((response) => response.arrayBuffer())
.then((data) => bz2.decompress(new Uint8Array(data)))
.then((array) => {
const json = JSON.parse(new TextDecoder().decode(array));
this._json = json;
return json;
});
return this._json_promise;
}
}
Insert cell
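// A minimal sketch of Volume2 in use: download() resolves to the parsed
// Extracted Features JSON, so the metadata should be reachable (assuming the
// corsproxy route above still answers).
v2_title = new Volume2(htid).download().then((json) => json.metadata.title)
Insert cell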
bz2 = require('bz2@1.0.1/index.js').catch(() => window["bz2"])
Insert cell
htid = "mdp.39076002651854"
Insert cell
id_to_stubbytree(htid)
Insert cell
g = fetch(
`https://corsproxy.io/?https://data.analytics.hathitrust.org/features-2020.03/${id_to_stubbytree(
htid
)}`
)
Insert cell
id_to_stubbytree = function (htid) {
// HathiTrust id cleaning: every ':' becomes '+' and every '/' becomes '='
// (e.g. "loc.ark:/13960/t2q52tn56" -> "loc.ark+=13960=t2q52tn56").
const htid_clean = htid.replace(/:/g, "+").replace(/\//g, "=")
// Split on the *first* period only: volume ids may themselves contain periods.
const dot = htid.indexOf(".")
const libid = htid.slice(0, dot)
const volid_clean = htid.slice(dot + 1).replace(/:/g, "+").replace(/\//g, "=")
// Stubbytree layout: library / every-third-character stub / full clean id.
const stub = volid_clean.split("").filter((d, i) => i % 3 == 0).join("")
return [libid, stub, htid_clean + ".json.bz2"].join("/")
}
Insert cell
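// A sanity check on the path construction: the stub directory is every third
// character of the cleaned volume id, so (worked out by hand from the rules
// above) the sample volume should land at the path below.
stubbytree_test = id_to_stubbytree("mdp.39076002651854") == "mdp/37055/mdp.39076002651854.json.bz2"
Insert cell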
class Volume {
// A class to describe a HathiTrust volume.
// Most methods are broken until the 'fetch' method has completed loading;
// in code, use `new Volume('yourid').load()`, which returns the completed
// volume inside a promise.
constructor(id) {
this.id = id;
this.chunk_cache = new Map();
this.fetch();
this.options = {
// Defaults; overridden by the options passed to individual methods.
sections: ["body"],
what: "word",
doc: "page",
size: 10000
};
}

promise_from_htrc() {
// An indexedDB-based local cache could short-circuit this fetch;
// it is disabled for now, so `cache` always resolves to undefined.
const cache = Promise.resolve(undefined);
return cache.then((cached_version) => {
if (cached_version !== undefined) {
console.log("Returning " + this.id + " from local cache");
return cached_version;
}
console.log("fetching " + this.id + " remotely");
return d3.json(
`https://corsproxy.io/?https://data.analytics.hathitrust.org/htrc-ef-access/get?action=download-ids&id=${this.id}&output=json`
);
});
}

fetch() {
// Fetch returns a promise of the data, and populates the cache at this.data.
// 'load' is a friendlier wrapper that returns the whole `Volume` object itself wrapped in a promise.

// Once it has resolved, you can access the data element directly, but it will often be necessary
// to wrap the fetch method in some promise resolution to work with it.

if (this.promise) {
return this.promise;
}
this.promise = this.promise_from_htrc().then((d) => {
this.data = d;
return d;
});
return this.promise;
}

async load() {
await this.fetch();
return this;
}

total_word_count(options) {
// This is also in the metadata.
const loc_counts = this.word_counts(options);
const tot = Array.from(loc_counts.values()).reduce(
(left, right) => +left + right,
0
);
return tot;
}

repr() {
// Return an HTML representation of the book suitable for linking back to Hathi.
const { title, pubDate, names, imprint, volumeIdentifier } =
this.data.metadata;
return `${names.join(
"/"
)}. <em>${title}</em>. ${imprint}. <code><a href=https://babel.hathitrust.org/cgi/pt?id=${
this.id
}>${this.id}</a></code>`;
}

count(options) {
if (options.what === undefined) {
options.what = "word";
}

if (options.doc === undefined) {
options.doc = "volume";
}

const { doc } = options;

if (doc == "chunk") {
return this.chunk_counts(options);
} else if (doc == "page") {
return this.page_counts(options);
} else if (doc == "ntile") {
const total = this.total_word_count(options);
const ntile_size = Math.floor(1 + (total + 1) / options.ntiles);
// It's a little sketchy to mutate this array in place;
// it may confuse people toggling back and forth.
const counts = this.chunk_counts(options, ntile_size);
return counts;
} else if (doc == "volume") {
return [this.word_counts(options)];
}

return "Choose an allowed count type";
}

chunk_counts(options, size) {
// Divide the volume into continuous chunks of roughly `size` words each.

// Results are memoized in chunk_cache, so heavy use with many different
// sizes could build up something like a memory leak.

if (size == undefined && options.size) {
size = options.size;
}
const { chunk_cache } = this;
const { what } = options;
if (chunk_cache.get(`${what}-${size}`)) {
return chunk_cache.get(`${what}-${size}`);
}

const pages = this.page_counts(options);

// Total tokens on each page, then a running total across pages.
const lengths = pages.map((d) => d3.sum(Array.from(d.values())));
const array_reducer = (a, x, i) => [...a, a.length > 0 ? x + a[i - 1] : x];
const cumulative_lengths = lengths.reduce(array_reducer, []);

// Assign each page to the chunk its running total falls into.
const chunk_lab = cumulative_lengths.map((n) => Math.floor(n / size));
const max_chunk = d3.max(chunk_lab);
const counts = d3.range(max_chunk + 1).map((d) => new Map());
pages.forEach((p, i) => {
merge_counts(counts[chunk_lab[i]], p);
});

chunk_cache.set(`${what}-${size}`, counts);
return counts;
}

word_counts(options) {
const pages = this.page_counts(options);
return pages.reduce(merge_counts, new Map());
}

page_counts(options) {
const { what } = options;
// Default to counting only the body section.
options.sections = options.sections || ["body"];
if (!this.data) {
throw new Error(
"Data must be (asynchronously) loaded; wrap your code in Volume.fetch().then(data => { ... })"
);
}
return this.data.features.pages.map((d) => new Page(d).counts(options));
}
}
Insert cell
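// A sketch of single-volume chunking: count() with doc "ntile" should return
// roughly `ntiles` Maps of word counts (v1 is defined in the tests below;
// Observable's reactivity makes the order irrelevant).
ntile_example = v1.load().then((v) => v.count({ doc: "ntile", ntiles: 20, what: "word" }).length)
Insert cell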
class Page {
// A single page in an EF volume.
constructor(data, volume) {
this.data = data
this.volume = volume
}
counts(options) {
const { what, sections, regex } = options
const page = this.data
const counts = new Map()
sections.forEach(section => {
if (page[section] === undefined) { return }
Object.entries(page[section].tokenPosCount).forEach((o) => {
const [word, v] = o
Object.entries(v).forEach((o2) => {
const [pos, count] = o2;
if (regex && !word.match(regex)) {
return
}
if (options.pos && options.pos != pos) {
return
}
let key = word
if (what == 'pos') {
key = pos
} else if (what == 'word&pos') {
key = `${word}%&%${pos}`
} else if (what == 'lowercase') {
key = word.toLowerCase()
}
counts.set(key, (counts.get(key)|| 0) + parseInt(count))
})
})
})
return counts
}
}
Insert cell
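// Page can also be used standalone on raw Extracted Features page data:
// here, word counts from the body of the first page of the first loaded volume.
first_page_counts = v1.load().then((v) => new Page(v.data.features.pages[0]).counts({ what: "word", sections: ["body"] }))
Insert cell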
lib.top_words(100, {what: 'lowercase'})
Insert cell

class Library {
constructor(ids = []) {
this.m = new Map()
for (const k of ids) {
this.m.set(k, new Volume(k))
}
}
/* OPERATORS: apply to chunks and update them */
word_counts(options) {
const accessor = (options.what == "word" || options.what == 'lowercase') ? d => d.word : d => '' + d.word + d.pos
const total_counts = d3.rollup(this.chunks(options), vals => d3.sum(vals.map(v => v.count)), accessor)
return total_counts
}
top_words(n, options = {'what': 'word'}) {
const counts = this.word_counts(options)
return ([...counts.entries()]).sort((a, b) => b[1] - a[1]).map(d => d[0]).slice(0, n)
}
haplax(chunks, options) {
// Tag each chunk by whether it holds the word's only occurrence in the corpus
// (i.e., whether the word is a corpus-wide hapax legomenon).
const accessor = options.what == "word" ? d => d.word : d => '' + d.word + d.pos
const total_counts = d3.rollup(chunks, vals => d3.sum(vals.map(v => v.count)), accessor)
chunks.forEach(chunk => {
// Is the count here the total count? If so, this is a corpus haplax.
chunk.haplax = total_counts.get(accessor(chunk)) == chunk.count
})
return chunks
}
dunning(chunks, options) {
// Does not check for non-occurring words in each chunk.
const documents = d3.group(chunks, d => d.index)
const wordCounts = d3.rollup(chunks, d => d3.sum(d.map(e => e.count)), d => d.word)
const total_words = d3.sum(wordCounts.values())
for (const [doc_index, elements] of documents) {
const c = d3.sum(elements.map(d => d.count))
const d = total_words - c
elements.forEach(element => {
// Formula from http://wordhoard.northwestern.edu/userman/analysis-comparewords.html
// How many times is this word in this chunk?
const { word, count } = element
const a = count
// How many times is the word *out* of this chunk?
const b = d3.max([1, wordCounts.get(word) - a])
const E1 = c*(a+b)/total_words
const E2 = d*(a+b)/total_words

// Bind the calculations to the object in-place
// PMI including itself in the corpus.
element.PMI = Math.log(a/E1)
// PMI compared to external corpus.
element.PMI_2 = Math.log((a/c)/(d3.max([1, b])/d))
element.dunning = 2*((a*Math.log(a/E1)) + (b*Math.log(b/E2)))
})
}
return chunks
}
tfidf(chunks, options) {
const documents = d3.group(chunks, d => d.index)
const n_documents = Array.from(documents).length;
// idf: log of (number of documents / number of documents containing the word).
const idf = d3.rollup(chunks, worddata => Math.log(n_documents/worddata.length), d => d.word)
for (const elements of documents.values()) {
const doc_wordcount = d3.sum(elements.map(d => d.count))
elements.forEach(element => {
const { count, word } = element;
const tf = count/doc_wordcount
element.tf = tf
element.tfidf = tf * idf.get(word)
})
}
return chunks
}
PMI(chunks, options) {
// PMI and dunning are assigned at the same time.
return this.dunning(chunks, options)
}
/* Basic strategies: create chunks. This is the primary method. */
chunks(options, additional_metrics = []) {
const output = []
let total_index = 0
for (const [htid, volume] of this.m.entries()) {
let chunk_num = 0;
for (const chunk of volume.count(options)) {
total_index += 1;
chunk_num += 1;
for (const [key, count] of chunk.entries()) {
let word = undefined
let pos = undefined
if (options.what == "word&pos") {
[word, pos] = key.split("%&%")
} else if (options.what == "word" || options.what == 'lowercase') {
word = key
} else if (options.what == "pos") {
pos = key
}
output.push({
'word': word,
'pos': pos,
'count': count,
'vol_index': chunk_num,
'index': total_index,
'htid': htid
})
}
}
}
additional_metrics.forEach(metric => this[metric](output, options))
return output
}
get(id) {
const { m } = this;
if (m.get(id) !== undefined) {
return m.get(id)
} else {
const v = new Volume(id)
m.set(id, v)
return v
}
}
fetch_all(ids = []) {
// Asynchronously ensure that all books are loaded, then return
// the library itself.
this.update(ids)
const volumes = Array
.from(this.m.values())
.map(book => book.fetch())
return Promise.all(volumes)
.then(() => this)
}
update(ids) {
// Ensure every id is registered as a Volume in the internal map.
ids.forEach(id => this.get(id))
return this
}
repr() {
const books = Array.from(this.m.values())
return Promise.all(books.map(book => book.fetch()))
.then( (data) => {
const reprs = books.map(book => book.repr())
let output = "<h3>Books plotted</h3><ol>"
reprs.forEach(e => {
const element = `<li>${e}</li>`
output += `\n${element}\n`
})
output += "</ol>"
return output
})
}
}
Insert cell
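// The chunks-plus-metrics pipeline end to end: the ten words with the highest
// Dunning log-likelihood when each volume is set against the rest of the corpus.
distinctive_words = lib.chunks({ doc: "volume", what: "word" }, ["dunning"]).sort((a, b) => b.dunning - a.dunning).slice(0, 10)
Insert cell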
function merge_counts(a, b) {
// Merge two count Maps into one.
// Modifies the left-hand Map in place; beware of side effects.
for (const [k, v] of b.entries()) {
a.set(k, (a.get(k) || 0) + v)
}
return a
}
Insert cell
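// merge_counts in isolation: the left-hand Map is mutated and returned.
merge_test = {
const a = new Map([["whale", 2]])
const b = new Map([["whale", 1], ["ship", 3]])
return merge_counts(a, b).get("whale") == 3 && a.get("ship") == 3
}
Insert cell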
md`# Tests

Here are some unit tests.

`
Insert cell
lib.chunks({'doc': 'ntile', 'ntiles': 100, what: 'word'}).length > lib.chunks({'doc': 'ntile', 'ntiles': 2, what: 'word'}).length
Insert cell
lib.chunks({'doc': 'chunk', 'size': 100000, 'what': 'word'})
Insert cell
// Are there 100 percentiles?
percentile_test = d3.max(lib.chunks({'doc': 'ntile', 'ntiles': 100, what: 'word'}).map(d => d.vol_index)) == 100
Insert cell
// Does it run Dunning scores successfully when asked?
dunning_exists = lib.chunks({'doc': 'ntile', 'ntiles': 20}, ["dunning", "tfidf"])[0].dunning != undefined
Insert cell
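// tf-idf over 10,000-word chunks: which terms most distinguish each chunk?
tfidf_example = {
const chunks = lib.chunks({ doc: "chunk", size: 10000, what: "word" }, ["tfidf"])
return chunks.sort((a, b) => b.tfidf - a.tfidf).slice(0, 5)
}
Insert cell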
v1 = Array.from(lib.m.values())[0]
Insert cell
word_counts_over_100 = v1.total_word_count({'what': "word", 'doc': "page"}) > 100
Insert cell
md`# Outside imports`
Insert cell
cola = require("webcola@3/WebCola/cola.min.js")
Insert cell
d3 = require('d3', 'd3-fetch', 'd3-array')
Insert cell
