Public
Edited
Oct 7, 2023
1 star
Insert cell
Insert cell
Insert cell
quickFuzz = {

let t1 = performance.now()
let keys = Object.keys(ents.dict);
let vals = Object.values(ents.dict);

let clip = 32;
let span = keys.length;
let dict = {};
let freq = new QuickSet({span,slot:8,freq:1,fifo:false})

for (let i = 0; i < span; ++i) {
let data = vals[i];
let size = data.length
for (let j = 0; j < size; ++j) {
let nums = data[j]
let iter = nums.length
// if (iter > clip) continue
for (let n = 0; n < iter; ++n) {
freq.minsum(nums[n])
}
}
let word = keys[i]
dict[word] = freq.top().filter(d=>d[0]!== i && d[1]>0).map(d=>keys[d[0]])
freq.clear(true)

}

return {
t: performance.now()-t1 + ents.t,
dict
}
}
Insert cell
ents = {
let t1 = performance.now();
let data = corpusEN[0];

let span = data.length;
let bits = 4,
divs = 1 / bits;
let calc = (span) => bits * Math.ceil(span * divs);

let refs = {};
let dict = {};
let init = Array.from({ length: span }, () => []);

// create basic word dictionary
for (let i = 0; i < span; ++i) {
let line = data[i].split(" ");
let size = line.length;
for (let j = 0; j < size; ++j) {
let word = line[j];
if (word.length < 2) continue;
let list = refs[word];
list ? list.push(i) : (refs[word] = list = [i]);

init[i].push(list)
}
}

// prepare dictionary for numeric encoding
let flat = Array.from({ length: span }, () => "");
let keys = Object.keys(refs);
//.sort((a,b)=> Math.abs(6-a.length) - Math.abs(6-b.length))
//.sort((a,b)=>a.length - b.length || a.localeCompare(b))
span = keys.length;

for (let i = 0; i < span; ++i) {
let line = keys[i];

// transliterate non-ascii chars
line = transliterable(line) ? encodeURIComponent(line) : line;

// playing with below parameters seem to improve/decrease fuzzy cluster quality
// TO DO better padding rules
// line = line.padEnd(calc(line.length),' ')
let copy = line;
for (let j = 0; j < bits; ++j) {
line += " ".repeat(j + 1) + copy;
}
line = line.padEnd(calc(line.length), " ");

flat[i] = line;
}

// concatenate keys for one-shot encoding
// might not be te best strategy for isolated terms

let pads = bits * 0;
let mark = String.fromCodePoint(0).repeat(pads);

let rope = flat.join(mark) + mark;
let size = rope.length;

let [view, exit] = byteLense(size, bits);
let utf8 = new TextEncoder();
utf8.encodeInto(rope, exit);

let freq = new Map();
let from, till = 0, ceil = Math.ceil;

// build word-to-word index
for (let i = 0; i < span; ++i) {
let word = keys[i];
dict[word] = [];

from = till;
till += ceil((flat[i].length + pads) * divs);

for (let n = from; n < till; ++n) {
let hash = view[n];
let data = freq.get(hash);
data
? data.at(-1) == i ? null : data.push(i)
: freq.set(hash, (data = [i]));

dict[word].push(data);
}
}

return {
t: performance.now() - t1,
dict,
refs,
init,
};
}

Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more