/**
 * A collection of volumes keyed by id, plus token-count metrics computed over
 * the chunk rows those volumes produce.
 *
 * Two names must be in scope wherever this class is used: `d3` (rollup, group,
 * sum, max) and `Volume`. Judging from the calls made here, a Volume is built
 * from an id and provides `count(options)` (an iterable of objects exposing
 * `entries()` that yields [key, count] pairs), `fetch()` and `repr()`.
 */
class Library {
  /**
   * @param {Iterable} ids - ids of the volumes to register immediately.
   */
  constructor(ids = []) {
    this.m = new Map();
    for (const id of ids) {
      this.get(id);
    }
  }

  /**
   * Build the function that maps a chunk row to its grouping key, so that
   * every metric groups rows the same way for a given `options.what`:
   *   - 'word' / 'lowercase' -> the row's `word`
   *   - 'pos'                -> the row's `pos`
   *   - anything else (e.g. 'word&pos') -> `word` and `pos` concatenated
   *
   * @param {Object} options - only `options.what` is read.
   * @returns {Function} row => key
   */
  _key_accessor(options = {}) {
    switch (options.what) {
      case 'word':
      case 'lowercase':
        return d => d.word;
      case 'pos':
        return d => d.pos;
      default:
        return d => '' + d.word + d.pos;
    }
  }

  /**
   * Total count per key across every chunk in the library.
   *
   * @param {Object} options - passed to `chunks()`; `what` selects the key.
   * @returns {Map} key -> summed count.
   */
  word_counts(options = {'what': 'word'}) {
    const key = this._key_accessor(options);
    return d3.rollup(this.chunks(options), rows => d3.sum(rows, r => r.count), key);
  }

  /**
   * The `n` keys with the highest total counts, most frequent first.
   *
   * @param {number} n - how many keys to return.
   * @param {Object} options - passed to `word_counts()`.
   * @returns {Array} keys only (counts are dropped).
   */
  top_words(n, options = {'what': 'word'}) {
    const counts = this.word_counts(options);
    return [...counts.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, n)
      .map(([key]) => key);
  }

  /**
   * Set `haplax` on every row, in place: true when the row's own count equals
   * the total count of its key over all rows, i.e. the key occurs in no other
   * row.
   *
   * @param {Array} chunks - rows as produced by `chunks()`.
   * @param {Object} options - `what` selects the grouping key.
   * @returns {Array} the same `chunks` array.
   */
  haplax(chunks, options) {
    const key = this._key_accessor(options);
    const totals = d3.rollup(chunks, rows => d3.sum(rows, r => r.count), key);
    chunks.forEach(chunk => {
      chunk.haplax = totals.get(key(chunk)) === chunk.count;
    });
    return chunks;
  }

  /**
   * Set `PMI`, `PMI_2` and `dunning` on every row, in place, comparing each
   * chunk (rows sharing an `index`) against the remainder of `chunks`.
   *
   * Does not check for non-occurring words in each chunk.
   * When `chunks` holds a single chunk there is no remainder (d === 0), so
   * `PMI_2` and `dunning` come out non-finite.
   *
   * @param {Array} chunks - rows as produced by `chunks()`.
   * @param {Object} options - `what` selects the grouping key.
   * @returns {Array} the same `chunks` array.
   */
  dunning(chunks, options = {'what': 'word'}) {
    const key = this._key_accessor(options);
    const documents = d3.group(chunks, d => d.index);
    const key_totals = d3.rollup(chunks, rows => d3.sum(rows, r => r.count), key);
    const total_words = d3.sum(key_totals.values());
    for (const elements of documents.values()) {
      // c: size of this chunk; d: size of everything outside it.
      const c = d3.sum(elements, e => e.count);
      const d = total_words - c;
      elements.forEach(element => {
        // Formula from http://wordhoard.northwestern.edu/userman/analysis-comparewords.html
        // How many times is this key in this chunk?
        const a = element.count;
        // How many times is the key *out* of this chunk? Clamped to at least
        // 1 so that the logarithms below never see zero.
        const b = d3.max([1, key_totals.get(key(element)) - a]);
        const E1 = c * (a + b) / total_words;
        const E2 = d * (a + b) / total_words;
        // Bind the calculations to the object in-place.
        // PMI including itself in the corpus.
        element.PMI = Math.log(a / E1);
        // PMI compared to external corpus.
        element.PMI_2 = Math.log((a / c) / (b / d));
        element.dunning = 2 * ((a * Math.log(a / E1)) + (b * Math.log(b / E2)));
      });
    }
    return chunks;
  }

  /**
   * Set `tf` and `tfidf` on every row, in place. `tf` is the row's count over
   * the total count of its chunk; idf is log(number of chunks / number of rows
   * carrying the key).
   *
   * @param {Array} chunks - rows as produced by `chunks()`.
   * @param {Object} options - `what` selects the grouping key.
   * @returns {Array} the same `chunks` array.
   */
  tfidf(chunks, options = {'what': 'word'}) {
    const key = this._key_accessor(options);
    const documents = d3.group(chunks, d => d.index);
    const n_documents = documents.size;
    const idf = d3.rollup(chunks, rows => Math.log(n_documents / rows.length), key);
    for (const elements of documents.values()) {
      const doc_wordcount = d3.sum(elements, e => e.count);
      elements.forEach(element => {
        const tf = element.count / doc_wordcount;
        element.tf = tf;
        element.tfidf = tf * idf.get(key(element));
      });
    }
    return chunks;
  }

  /**
   * PMI and dunning are assigned at the same time, so this delegates to
   * `dunning()`.
   */
  PMI(chunks, options) {
    return this.dunning(chunks, options);
  }

  /**
   * Basic strategy: create chunks. This is the primary method.
   *
   * Flattens every volume's `count(options)` output into one array of rows
   * `{word, pos, count, vol_index, index, htid}`. `vol_index` numbers chunks
   * from 1 within a volume; `index` numbers them from 1 across the library.
   *
   * @param {Object} options - forwarded to `volume.count()`; `what` decides
   *   whether each key fills `word`, `pos`, or both (split on "%&%").
   * @param {Array} additional_metrics - names of methods on this object
   *   (e.g. 'haplax', 'dunning', 'tfidf') called as `(rows, options)` to
   *   annotate the rows in place.
   * @returns {Array} the rows.
   */
  chunks(options, additional_metrics = []) {
    const output = [];
    let total_index = 0;
    for (const [htid, volume] of this.m.entries()) {
      let chunk_num = 0;
      for (const chunk of volume.count(options)) {
        total_index += 1;
        chunk_num += 1;
        for (const [key, count] of chunk.entries()) {
          let word;
          let pos;
          if (options.what === 'word&pos') {
            [word, pos] = key.split('%&%');
          } else if (options.what === 'word' || options.what === 'lowercase') {
            word = key;
          } else if (options.what === 'pos') {
            pos = key;
          }
          output.push({
            'word': word,
            'pos': pos,
            'count': count,
            'vol_index': chunk_num,
            'index': total_index,
            'htid': htid,
          });
        }
      }
    }
    additional_metrics.forEach(metric => this[metric](output, options));
    return output;
  }

  /**
   * Return the volume registered under `id`, creating and registering it on
   * first use.
   */
  get(id) {
    let volume = this.m.get(id);
    if (volume === undefined) {
      volume = new Volume(id);
      this.m.set(id, volume);
    }
    return volume;
  }

  /**
   * Register `ids`, call `fetch()` on every volume in the library, and
   * resolve with the library itself once all of them have resolved.
   *
   * @param {Array} ids - extra ids to register before fetching.
   * @returns {Promise} resolves to this Library.
   */
  fetch_all(ids = []) {
    this.update(ids);
    const pending = Array.from(this.m.values(), book => book.fetch());
    return Promise.all(pending).then(() => this);
  }

  /**
   * Make sure every id in `ids` has a volume registered.
   *
   * @returns {Library} this, for chaining.
   */
  update(ids) {
    ids.forEach(id => this.get(id));
    return this;
  }

  /**
   * Fetch every volume, then build an HTML ordered list with one item per
   * volume.
   *
   * NOTE(review): each `book.repr()` result is interpolated as-is, without
   * escaping — confirm Volume.repr() returns trusted markup.
   *
   * @returns {Promise} resolves to the HTML string.
   */
  repr() {
    const books = Array.from(this.m.values());
    return Promise.all(books.map(book => book.fetch())).then(() => {
      const items = books.map(book => `\n<li>${book.repr()}</li>\n`);
      return `<h3>Books plotted</h3><ol>${items.join('')}</ol>`;
    });
  }
}