Public
Edited
Sep 11, 2023
Paused
1 fork
1 star
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
{
let size = 4;
let pats = Array.from({length:size},(_,i)=> new RegExp(`(${Array(i+2).join('.')})`))
let arr = [];
for (let win = 1; win < size; ++win) {
arr.push(string.split(pats[win]).concat(string.slice(1).split(pats[win])).filter(s=>s.length == win+1))
}
return arr
}
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
SkipList = (await import('https://esm.sh/finalskiplist@1.0.2')).default
Insert cell
v4_idx = 10//5338 - v4_firstN
Insert cell
v4_inspect = [corpus[0][0],corpus[1][0]]
Insert cell
v4_firstN = 1000
Insert cell
Insert cell
Insert cell
v4_union.heap.slice(0,v4_firstN).reduce((a,b)=>a+!!b,0)/v4_firstN
Insert cell
v4_union = {

let A = corpus[0]
let B = corpus[1]
let t1 = performance.now()
let C = A.concat(B)
let span = C.length;
let size = 3, dict = {}, heap = [], link = {};
//C = C.map((str,i)=>{
for (let i = 0; i < span; ++i) {
let str = process(C[i],{level:1})
heap[i] = []//new Set();
for (var win = 2; win< size; ++win){

let prev;
for (var j = 0; j < str.length-win; j++) {
const hash = str.substring(j,(j+1)+win);
//( dict[hash] ) ? ( dict[hash][i] ? ++dict[hash][i] : dict[hash][i] = 1 ) :
//( dict[hash] = {}, dict[hash][i] = 1 )

( dict[hash] ) ? dict[hash].add(i) :
( dict[hash] = new Set(), dict[hash].add(i) )

if(prev) { let tie = prev+'|'+hash; link[tie]? ++link[tie]: link[tie] = 1 }
heap[i].push(hash);
prev = hash;
}
}
}
link = Object.entries(link)
link = link.sort((a,b)=>b[1]-a[1]).slice(0,parseInt(link.length * 0.025)).map(d=>d[0].split('|')[1])
let skip = new Set(link)

let keys = Object.keys(dict)
for (var i = 0; i < keys.length; ++i) {
//dict[keys[i]] = Object.keys(dict[keys[i]])//.sort((a,b)=>.025- Math.random()).slice(0,60)
dict[keys[i]] = Array.from(dict[keys[i]])
}

// TODO: iterate on max length, swap when init < exit
// TODO: how to best splice arrays
for (var i = 0; i < A.length; ++i) {
let hash = heap[i] //Array.from(heap[i]??[]);
let freq = Array(span);

for (var j = 0; j < hash.length; ++j) {
// filters A from B, B from A
let entry = hash[j]
if(skip.has(entry)) { continue }
let locs = dict[entry].filter((ref,j)=> i < A.length ? ref >= A.length : ref < A.length ) // implicit casting

for (var n = 0; n < locs.length; ++n) {
let idx = locs[n];
freq[idx] ? ++freq[idx] : freq[idx] = 1;

}
}
heap[i] = //[C[i]].concat(
freq.map((d,i)=>[i,d])
.filter(d=>d[1]>1 /*&& (i < A.length ? d[0] > span*0.5 : d[0] <= span*0.5)*/)
.sort((a,b)=>b[1]-a[1]).slice(0,16)
// .map((d)=>'['+d[0]+']'+[C[d[0]],d[1]][0])
.map((d)=>parseInt(C[d[0]].split(/[\[\]]/)[1])).includes(i)
//)
}
return {
dict,heap,link,skip,
time: performance.now()-t1
}
}
Insert cell
Insert cell
v3_idx = parseInt(Math.random()*1000)
Insert cell
[...v3.heap[v3_idx].dict.keys()].map(k=>corpus[0]?.[k]??corpus[1][k])//.sort()
Insert cell
// keep list of ngrams appearing in multiple strings

v3 = {
let input = [...corpus[0].slice(0,1000),...corpus[1].slice(0,1000)]; //zipf.slice(0,1000)
let t1 = performance.now()
let span = input.length;
let max = 4; let dict = {}, heap = [], temp = new dw.Cache(512);
// let win = 0;
// console.clear()
let nest = input.map((str,i)=>{
str = str.replace(/[aeoiu]/g,'')
let key = i

heap[key] = new dw.Cache(5);
//let arr = []
for (var win = 3; win< max; ++win){
let sub = new Set();
for (var j = 0; j < str.length-win; ++j) {
var hash = str.substring(j,(j+1)+win);
let freq = dict[hash];
freq ? freq[key]
? ++freq[key]
: freq[key] = 1
: (freq = dict[hash] = {}, freq[key] = 1)

/* let loc = temp.get(hash);
loc ? loc[key]
? (++loc[key])
: (loc[key] = 1)
: (loc = {[key]:1}, temp.set(hash,loc))*/

sub.add(dict[hash])
//sub.add(loc)
}

for (var hash of sub) {
let loc = Object.keys(hash);
for (var n = 0; n< loc.length; ++n) {
let idx = parseInt(loc[n])
let val = heap[idx].get(key)
val ? heap[idx].set(key, ++val) : heap[idx].set(key, 1); // inverted key <--> idx improves results?

val = heap[key].get(idx)
val ? ++val : heap[key].set(key, 1)
}
}
}
return str
})

return {
dict,heap,temp,
time: performance.now()-t1
}
}
Insert cell
Insert cell
v2_idx = 80//parseInt(Math.random()*1000)
Insert cell
v2_firstN = 100
Insert cell
v2_inspect = [corpus[0][v2_idx],corpus[1][v2_idx]]
Insert cell
v2_cluster = v2.heap[v2_idx].map(idx=>corpus[1][parseInt(idx)-v2_firstN])
Insert cell
v2_nomatch = v2.heap.slice(0,v2_firstN).map((d,i)=>[i,d.slice(0,1).map(d=>parseInt(d)-v2_firstN).includes(i)]).filter(d=>!d[1])
Insert cell
v2 = {
let input = [...corpus[0].slice(0,v2_firstN),...corpus[1].slice(0,v2_firstN)]; //input.sort();
let t1 = performance.now()
let span = input.length;
let max = 3; let dict = {}, heap = [], meta = {};
// let win = 0;
// console.clear()
let nest = input.map((str,i)=>{

let key = i
// str = str.replace(/[aeoiu]/g,'')
//str = ' ' + str + ' '
heap[key] = new Set();
for (var win = 2; win< max; ++win){
//let sub = new Set();
for (var j = 0; j < str.length-win; ++j) {
var hash = str.substring(j,(j+1)+win);
//dict[hash] ? dict[hash].push(i) : dict[hash] = [i];
let loc = dict[hash];
loc ? loc[key]
? loc[key] = [i,++loc[key][1]]
: loc[key] = [i,1]
: (loc = dict[hash] = [], loc[key] = [i,1])
//sub.add(dict[hash])
heap[key].add(hash)
meta[hash] = dict[hash];
}
}
})

// sorting ngram arrays
// per str, unify top-n ngram arrays
// count and sort by ngram matches
// each string now has a ranking of potential str matches
// -> idea: build ranked ngram array from the get go to avoid large sorting operations

let keys = Object.keys(meta)
for (var i = 0; i < keys.length; ++i) {
// d3.quickselect(dict[keys[i]],16,0,dict[keys[i]].length,(a,b)=>b?.[1]-a?.[1])
//dict[keys[i]].sort((a,b)=>b[1]-a[1])
//fst.inPlaceSort(dict[keys[i]]).desc([k=>k[1]]).slice(0,16)
meta[keys[i]] = fst.inPlaceSort(meta[keys[i]]).desc([k=>k[1]]).slice(0,64)
}

/*heap = heap.map((h,i)=>
fst.inPlaceSort(
Object.entries(
Array.from(h).flatMap(a=> a.slice(0,128)).map(d=>d[0]).reduce((pool,item)=> (pool[item]? ++pool[item] : pool[item]=1,pool),{})
).filter(d=>d[0] >= 100 && d[1]>1)
).desc([e=>e[1]])
)*/

heap = heap.map((h,i)=>
fst.inPlaceSort(
Object.entries(
Array.from(h).flatMap(hash=>meta[hash]).reduce((pool,[key,val])=> (pool[key]? ++pool[key] : pool[key]=1,pool),{})
).filter(d=>d[0] >= span*0.5 && d[1]>2)
).desc([e=>e[1]])
)

return {
dict,heap,meta,
time: performance.now()-t1
}
}
Insert cell
d3.quickselect([[1,1],,[2,2],[3,1]],2,0,4,(a,b)=>b?.[1]-a?.[1])
Insert cell
v1 = {
let input = zipf.slice(0,1000)
let t1 = performance.now()
let span = input.length;
let max = 3; let dict = {}, heap = [];
// let win = 0;
console.clear()
let nest = input.map((str,i)=>{
//str = ' ' + str + ' '
heap[i] = [] //Array(span) //new Set();
for (var win = 1; win< max; ++win){
let sub = new Set();
for (var j = 0; j < str.length-win; ++j) {
var hash = str.substring(j,(j+1)+win);
//dict[hash] ? dict[hash].push(i) : dict[hash] = [i];
let loc = dict[hash];
loc ? loc[i]
? loc[i] += 1 //= [i,++loc[i][1]]
: loc[i] = 1//[i,1]
: (loc = dict[hash] = {}, loc[i] = 1 /*[i,1]*/)

sub.add(dict[hash])
// heap[i].push(dict[hash])
// heap[i].push(structuredClone(dict[hash]))
}
for (var hash of sub) {
let loc = Object.keys(hash);
for (var n = 0; n< Math.min(loc.length); ++n) {
let heapData = heap[i][loc[n]];
heapData ? ++heapData /*+= hash[loc[n]]*/ : heapData = heap[i][loc[n]] = 1 //hash[loc[n]]
heap[loc[n]][i] = heapData
}
}
//heap[i] = sub
//[...sub].map(hash=> dict[hash].forEach((h,n)=> heap[i][n] ? heap[i][n] += h : heap[i][n]= h))
}
return str
})

/* let keys = Object.keys(dict)
for (var i = 0; i < keys.length; ++i) {
dict[keys[i]].sort((a,b)=>a-b)
let hash = Object.keys(dict[keys[i]])
for (var j = 0; j < hash.length; ++j) {
if(dict[keys[i]][hash[j]] !== dict[keys[i]][parseInt(hash[j])+1] ) {
delete dict[keys[i]][hash[j]]
} else { continue }
}
}*/
/*for (var hash in dict) {
dict[hash].sort((a,b)=>a-b)//(a,b)=>b[1]-a[1])
for (var idx in dict[hash]) {
if(dict[hash][idx] !== dict[hash][parseInt(idx)+1]) {
delete dict[hash][idx]
}
}
}*/
/*let keys = Object.keys(dict)
for (var i = 0; i < keys.length; ++i) {
let hash = Object.keys(dict[keys[i]])
for (var j = 0; j < hash.length; ++j) {
if(dict[keys[i]][hash[j]] == 1) {
delete dict[keys[i]][hash[j]]
} else { continue }
}
}*/
//heap = heap.map(d=>[...d].map(n=>n[0]).sort((a,b)=>b[1]-a[1]))//n.slice(0,2)))
/*for (var hash in dict) {
for (var idx in dict[hash]) {
if(dict[hash][idx] < 3 ){
delete dict[hash][idx]
} else { continue }
}
}*/

// heap = heap.map(mat=>mat.map(a=>a.filter(x=>x>1)))
// heap = heap.map(h=>h.flatMap(n=>Object.keys(n)).sort())
// heap = heap.map(h=>h.sort((a,b)=>b-a))
return {
dict,heap,
time: performance.now()-t1
}
}
Insert cell
/*for (let n = 0; n < loc.length; ++n) {
// heap[i][loc[n]] ? heap[i][loc[n]] += 0 : heap[i][loc[n]] =1
heap[i]
? heap[i][loc[n]]
? heap[i][loc[n]] += 1
: heap[i][loc[n]] = 1
: heap[i] = [];
}*/

Insert cell
// from https://gist.github.com/matsadler/6087275

kdtree = {
function kdtree(dimensions, points, depth) {
depth = depth || 0;
var axis = depth % dimensions,
node = { axis: axis };

points.sort(function (a, b) {
return a[axis] - b[axis];
});
var lefts = points.splice(0, Math.floor(points.length / 2));

node.value = points.shift();

depth += 1;
if (lefts.length) {
node.left = kdtree(dimensions, lefts, depth);
}
if (points.length) {
node.right = kdtree(dimensions, points, depth);
}

return node;
}

return kdtree;
}
Insert cell
// from https://gist.github.com/matsadler/6087275

kdnear = {
// returns the k nearest points to target as an array
// k should be the number of points to return
// kdtree should be a kdtree
// target should be an array of points
// results shouldn't be passes, it's used internally
// example:
// var closest = nearest(1, tree, [1,1]).first;
function nearest(k, kdtree, target, results) {
results = results || [];
var axis = kdtree.axis,
value = kdtree.value,
result = {
value: value,
distance: distanceSquare(target, value),
},
comparison,
unsearched,
axisDistance,
maxDistance;
if (!kdtree.left && !kdtree.right) {
return priorityQueueAdd(distanceCompare, k, results, result);
}

comparison = target[axis] - value[axis];
if (comparison < 0) {
unsearched = kdtree.right;
if (kdtree.left) {
nearest(k, kdtree.left, target, results);
}
} else if (comparison > 0) {
unsearched = kdtree.left;
if (kdtree.right) {
nearest(k, kdtree.right, target, results);
}
} else {
if (kdtree.left) {
nearest(k, kdtree.left, target, results);
}
if (kdtree.right) {
nearest(k, kdtree.right, target, results);
}
}

axisDistance = axisDistanceSquare(axis, value, target);
if (results.length) {
maxDistance = results[results.length - 1].distance;
}
if (unsearched && (!maxDistance || axisDistance < maxDistance)) {
nearest(k, unsearched, target, results);
}

return priorityQueueAdd(distanceCompare, k, results, result);
}

// helper functions

function distanceCompare(a, b) {
return a.distance - b.distance;
}

function distanceSquare(a, b) {
var sum = 0,
i;
for (i = 0; i < a.length; i += 1) {
sum += axisDistanceSquare(i, a, b);
}
return sum;
}

function axisDistanceSquare(axis, a, b) {
return Math.pow(a[axis] - b[axis], 2);
}

// the following helper functions are pretty generic and could be extracted to
// their own module

// modifies queue
function priorityQueueAdd(cmpFunc, max, queue, value) {
queue = insertSorted(cmpFunc, queue, value);
if (queue.length > max) {
queue.pop();
}
return queue;
}

// modifies array
function insertSorted(cmpFunc, array, value) {
if (!array.length) {
return array.push(value);
}
var pivotComparison = index(cmpFunc, array, value),
pivot = pivotComparison[0],
comparison = pivotComparison[1];
if (0 < comparison) {
array.splice(pivot, 0, value);
} else {
array.splice(pivot + 1, 0, value);
}
return array;
}

// returns an index and a signal value as an array, eg [index, signal]
// if the signal is 0 then value === array[index], if signal is < 0 then
// value < array[index] && value > array[index - 1], if signal is > 0 then
// value > array[index] && value < array[index + 1]
function index(cmpFunc, array, value) {
var start = 0,
length = array.length,
finish = length,
pivot,
comparison;

while ((length = finish - start) > 0) {
pivot = start + Math.floor(length / 2);
comparison = cmpFunc(array[pivot], value);
if (0 > comparison) {
start = pivot + 1;
} else if (0 === comparison) {
break;
} else {
finish = pivot;
}
}

return [pivot, comparison];
}
return nearest
}
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
function hashWords(words, hash) {
for (var i=0; i<words.length; i++) {
hash.update(words[i]);
}
}
Insert cell
Insert cell
Insert cell
fastCount = {

let init = corpus[0].slice(0,5000)
let span = init.length
let dict = {}
let t1= performance.now()
let r;
for (let i = 0; i < span; ++i) {
let o = {}
let w = init[i].split(' ')

//dict[i] = o;
for (let j = 0;j < w.length;++j){
o[w[j]] = 1

//let s = w[j]
// dict[s]?
// dict[s]++:dict[s] = 1
}
//dict[i] = o
}

return {
dict,
time: performance.now()-t1
}
}
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more