Set alignment / dleeftink

{

let size = 4;

let pats = Array.from({length:size},(_,i)=> new RegExp(`(${Array(i+2).join('.')})`))

let arr = [];

for (let win = 1; win < size; ++win) {

arr.push(string.split(pats[win]).concat(string.slice(1).split(pats[win])).filter(s=>s.length == win+1))

}

return arr

}

SkipList = (await import('https://esm.sh/finalskiplist@1.0.2')).default

v4_idx = 10//5338 - v4_firstN

v4_inspect = [corpus[0][0],corpus[1][0]]

v4_firstN = 1000

v4_union.heap.slice(0,v4_firstN).reduce((a,b)=>a+!!b,0)/v4_firstN

v4_union = {

let A = corpus[0]

let B = corpus[1]

let t1 = performance.now()

let C = A.concat(B)

let span = C.length;

let size = 3, dict = {}, heap = [], link = {};

//C = C.map((str,i)=>{

for (let i = 0; i < span; ++i) {

let str = process(C[i],{level:1})

heap[i] = []//new Set();

for (var win = 2; win< size; ++win){

let prev;

for (var j = 0; j < str.length-win; j++) {

const hash = str.substring(j,(j+1)+win);

//( dict[hash] ) ? ( dict[hash][i] ? ++dict[hash][i] : dict[hash][i] = 1 ) :

//( dict[hash] = {}, dict[hash][i] = 1 )

( dict[hash] ) ? dict[hash].add(i) :

( dict[hash] = new Set(), dict[hash].add(i) )

if(prev) { let tie = prev+'|'+hash; link[tie]? ++link[tie]: link[tie] = 1 }

heap[i].push(hash);

prev = hash;

}

link = Object.entries(link)

link = link.sort((a,b)=>b[1]-a[1]).slice(0,parseInt(link.length * 0.025)).map(d=>d[0].split('|')[1])

let skip = new Set(link)

let keys = Object.keys(dict)

for (var i = 0; i < keys.length; ++i) {

//dict[keys[i]] = Object.keys(dict[keys[i]])//.sort((a,b)=>.025- Math.random()).slice(0,60)

dict[keys[i]] = Array.from(dict[keys[i]])

}

// TODO: iterate on max length, swap when init < exit

// TODO: how to best splice arrays

for (var i = 0; i < A.length; ++i) {

let hash = heap[i] //Array.from(heap[i]??[]);

let freq = Array(span);

for (var j = 0; j < hash.length; ++j) {

// filters A from B, B from A

let entry = hash[j]

if(skip.has(entry)) { continue }

let locs = dict[entry].filter((ref,j)=> i < A.length ? ref >= A.length : ref < A.length ) // implicit casting

for (var n = 0; n < locs.length; ++n) {

let idx = locs[n];

freq[idx] ? ++freq[idx] : freq[idx] = 1;

}

heap[i] = //[C[i]].concat(

freq.map((d,i)=>[i,d])

.filter(d=>d[1]>1 /*&& (i < A.length ? d[0] > span*0.5 : d[0] <= span*0.5)*/)

.sort((a,b)=>b[1]-a[1]).slice(0,16)

// .map((d)=>'['+d[0]+']'+[C[d[0]],d[1]][0])

.map((d)=>parseInt(C[d[0]].split(/[\[\]]/)[1])).includes(i)

//)

}

return {

dict,heap,link,skip,

time: performance.now()-t1

}

v3_idx = parseInt(Math.random()*1000)

[...v3.heap[v3_idx].dict.keys()].map(k=>corpus[0]?.[k]??corpus[1][k])//.sort()

// keep list of ngrams appearing in multiple strings

v3 = {

let input = [...corpus[0].slice(0,1000),...corpus[1].slice(0,1000)]; //zipf.slice(0,1000)

let t1 = performance.now()

let span = input.length;

let max = 4; let dict = {}, heap = [], temp = new dw.Cache(512);

// let win = 0;

// console.clear()

let nest = input.map((str,i)=>{

str = str.replace(/[aeoiu]/g,'')

let key = i

heap[key] = new dw.Cache(5);

//let arr = []

for (var win = 3; win< max; ++win){

let sub = new Set();

for (var j = 0; j < str.length-win; ++j) {

var hash = str.substring(j,(j+1)+win);

let freq = dict[hash];

freq ? freq[key]

? ++freq[key]

: freq[key] = 1

: (freq = dict[hash] = {}, freq[key] = 1)

/* let loc = temp.get(hash);

loc ? loc[key]

? (++loc[key])

: (loc[key] = 1)

: (loc = {[key]:1}, temp.set(hash,loc))*/

sub.add(dict[hash])

//sub.add(loc)

}

for (var hash of sub) {

let loc = Object.keys(hash);

for (var n = 0; n< loc.length; ++n) {

let idx = parseInt(loc[n])

let val = heap[idx].get(key)

val ? heap[idx].set(key, ++val) : heap[idx].set(key, 1); // inverted key <--> idx improves results?

val = heap[key].get(idx)

val ? ++val : heap[key].set(key, 1)

}

return str

})

return {

dict,heap,temp,

time: performance.now()-t1

}

v2_idx = 80//parseInt(Math.random()*1000)

v2_firstN = 100

v2_inspect = [corpus[0][v2_idx],corpus[1][v2_idx]]

v2_cluster = v2.heap[v2_idx].map(idx=>corpus[1][parseInt(idx)-v2_firstN])

v2_nomatch = v2.heap.slice(0,v2_firstN).map((d,i)=>[i,d.slice(0,1).map(d=>parseInt(d)-v2_firstN).includes(i)]).filter(d=>!d[1])

v2 = {

let input = [...corpus[0].slice(0,v2_firstN),...corpus[1].slice(0,v2_firstN)]; //input.sort();

let t1 = performance.now()

let span = input.length;

let max = 3; let dict = {}, heap = [], meta = {};

// let win = 0;

// console.clear()

let nest = input.map((str,i)=>{

let key = i

// str = str.replace(/[aeoiu]/g,'')

//str = ' ' + str + ' '

heap[key] = new Set();

for (var win = 2; win< max; ++win){

//let sub = new Set();

for (var j = 0; j < str.length-win; ++j) {

var hash = str.substring(j,(j+1)+win);

//dict[hash] ? dict[hash].push(i) : dict[hash] = [i];

let loc = dict[hash];

loc ? loc[key]

? loc[key] = [i,++loc[key][1]]

: loc[key] = [i,1]

: (loc = dict[hash] = [], loc[key] = [i,1])

//sub.add(dict[hash])

heap[key].add(hash)

meta[hash] = dict[hash];

}

})

// sorting ngram arrays

// per str, unify top-n ngram arrays

// count and sort by ngram matches

// each string now has a ranking of potential str matches

// -> idea: build ranked ngram array from the get go to avoid large sorting operations

let keys = Object.keys(meta)

for (var i = 0; i < keys.length; ++i) {

// d3.quickselect(dict[keys[i]],16,0,dict[keys[i]].length,(a,b)=>b?.[1]-a?.[1])

//dict[keys[i]].sort((a,b)=>b[1]-a[1])

//fst.inPlaceSort(dict[keys[i]]).desc([k=>k[1]]).slice(0,16)

meta[keys[i]] = fst.inPlaceSort(meta[keys[i]]).desc([k=>k[1]]).slice(0,64)

}

/*heap = heap.map((h,i)=>

fst.inPlaceSort(

Object.entries(

Array.from(h).flatMap(a=> a.slice(0,128)).map(d=>d[0]).reduce((pool,item)=> (pool[item]? ++pool[item] : pool[item]=1,pool),{})

).filter(d=>d[0] >= 100 && d[1]>1)

).desc([e=>e[1]])

)*/

heap = heap.map((h,i)=>

fst.inPlaceSort(

Object.entries(

Array.from(h).flatMap(hash=>meta[hash]).reduce((pool,[key,val])=> (pool[key]? ++pool[key] : pool[key]=1,pool),{})

).filter(d=>d[0] >= span*0.5 && d[1]>2)

).desc([e=>e[1]])

)

return {

dict,heap,meta,

time: performance.now()-t1

}

d3.quickselect([[1,1],,[2,2],[3,1]],2,0,4,(a,b)=>b?.[1]-a?.[1])

v1 = {

let input = zipf.slice(0,1000)

let t1 = performance.now()

let span = input.length;

let max = 3; let dict = {}, heap = [];

// let win = 0;

console.clear()

let nest = input.map((str,i)=>{

//str = ' ' + str + ' '

heap[i] = [] //Array(span) //new Set();

for (var win = 1; win< max; ++win){

let sub = new Set();

for (var j = 0; j < str.length-win; ++j) {

var hash = str.substring(j,(j+1)+win);

//dict[hash] ? dict[hash].push(i) : dict[hash] = [i];

let loc = dict[hash];

loc ? loc[i]

? loc[i] += 1 //= [i,++loc[i][1]]

: loc[i] = 1//[i,1]

: (loc = dict[hash] = {}, loc[i] = 1 /*[i,1]*/)

sub.add(dict[hash])

// heap[i].push(dict[hash])

// heap[i].push(structuredClone(dict[hash]))

}

for (var hash of sub) {

let loc = Object.keys(hash);

for (var n = 0; n< Math.min(loc.length); ++n) {

let heapData = heap[i][loc[n]];

heapData ? ++heapData /*+= hash[loc[n]]*/ : heapData = heap[i][loc[n]] = 1 //hash[loc[n]]

heap[loc[n]][i] = heapData

}

//heap[i] = sub

//[...sub].map(hash=> dict[hash].forEach((h,n)=> heap[i][n] ? heap[i][n] += h : heap[i][n]= h))

}

return str

})

/* let keys = Object.keys(dict)

for (var i = 0; i < keys.length; ++i) {

dict[keys[i]].sort((a,b)=>a-b)

let hash = Object.keys(dict[keys[i]])

for (var j = 0; j < hash.length; ++j) {

if(dict[keys[i]][hash[j]] !== dict[keys[i]][parseInt(hash[j])+1] ) {

delete dict[keys[i]][hash[j]]

} else { continue }

}

}*/

/*for (var hash in dict) {

dict[hash].sort((a,b)=>a-b)//(a,b)=>b[1]-a[1])

for (var idx in dict[hash]) {

if(dict[hash][idx] !== dict[hash][parseInt(idx)+1]) {

delete dict[hash][idx]

}

}*/

/*let keys = Object.keys(dict)

for (var i = 0; i < keys.length; ++i) {

let hash = Object.keys(dict[keys[i]])

for (var j = 0; j < hash.length; ++j) {

if(dict[keys[i]][hash[j]] == 1) {

delete dict[keys[i]][hash[j]]

} else { continue }

}

}*/

//heap = heap.map(d=>[...d].map(n=>n[0]).sort((a,b)=>b[1]-a[1]))//n.slice(0,2)))

/*for (var hash in dict) {

for (var idx in dict[hash]) {

if(dict[hash][idx] < 3 ){

delete dict[hash][idx]

} else { continue }

}

}*/

// heap = heap.map(mat=>mat.map(a=>a.filter(x=>x>1)))

// heap = heap.map(h=>h.flatMap(n=>Object.keys(n)).sort())

// heap = heap.map(h=>h.sort((a,b)=>b-a))

return {

dict,heap,

time: performance.now()-t1

}

/*for (let n = 0; n < loc.length; ++n) {

// heap[i][loc[n]] ? heap[i][loc[n]] += 0 : heap[i][loc[n]] =1

heap[i]

? heap[i][loc[n]]

? heap[i][loc[n]] += 1

: heap[i][loc[n]] = 1

: heap[i] = [];

}*/

// from https://gist.github.com/matsadler/6087275

kdtree = {

function kdtree(dimensions, points, depth) {

depth = depth || 0;

var axis = depth % dimensions,

node = { axis: axis };

points.sort(function (a, b) {

return a[axis] - b[axis];

});

var lefts = points.splice(0, Math.floor(points.length / 2));

node.value = points.shift();

depth += 1;

if (lefts.length) {

node.left = kdtree(dimensions, lefts, depth);

}

if (points.length) {

node.right = kdtree(dimensions, points, depth);

}

return node;

}

return kdtree;

}

// from https://gist.github.com/matsadler/6087275

kdnear = {

// returns the k nearest points to target as an array

// k should be the number of points to return

// kdtree should be a kdtree

// target should be an array of points

// results shouldn't be passes, it's used internally

// example:

// var closest = nearest(1, tree, [1,1]).first;

function nearest(k, kdtree, target, results) {

results = results || [];

var axis = kdtree.axis,

value = kdtree.value,

result = {

value: value,

distance: distanceSquare(target, value),

},

comparison,

unsearched,

axisDistance,

maxDistance;

if (!kdtree.left && !kdtree.right) {

return priorityQueueAdd(distanceCompare, k, results, result);

}

comparison = target[axis] - value[axis];

if (comparison < 0) {

unsearched = kdtree.right;

if (kdtree.left) {

nearest(k, kdtree.left, target, results);

}

} else if (comparison > 0) {

unsearched = kdtree.left;

if (kdtree.right) {

nearest(k, kdtree.right, target, results);

}

} else {

if (kdtree.left) {

nearest(k, kdtree.left, target, results);

}

if (kdtree.right) {

nearest(k, kdtree.right, target, results);

}

axisDistance = axisDistanceSquare(axis, value, target);

if (results.length) {

maxDistance = results[results.length - 1].distance;

}

if (unsearched && (!maxDistance || axisDistance < maxDistance)) {

nearest(k, unsearched, target, results);

}

return priorityQueueAdd(distanceCompare, k, results, result);

}

// helper functions

function distanceCompare(a, b) {

return a.distance - b.distance;

}

function distanceSquare(a, b) {

var sum = 0,

i;

for (i = 0; i < a.length; i += 1) {

sum += axisDistanceSquare(i, a, b);

}

return sum;

}

function axisDistanceSquare(axis, a, b) {

return Math.pow(a[axis] - b[axis], 2);

}

// the following helper functions are pretty generic and could be extracted to

// their own module

// modifies queue

function priorityQueueAdd(cmpFunc, max, queue, value) {

queue = insertSorted(cmpFunc, queue, value);

if (queue.length > max) {

queue.pop();

}

return queue;

}

// modifies array

function insertSorted(cmpFunc, array, value) {

if (!array.length) {

return array.push(value);

}

var pivotComparison = index(cmpFunc, array, value),

pivot = pivotComparison[0],

comparison = pivotComparison[1];

if (0 < comparison) {

array.splice(pivot, 0, value);

} else {

array.splice(pivot + 1, 0, value);

}

return array;

}

// returns an index and a signal value as an array, eg [index, signal]

// if the signal is 0 then value === array[index], if signal is < 0 then

// value < array[index] && value > array[index - 1], if signal is > 0 then

// value > array[index] && value < array[index + 1]

function index(cmpFunc, array, value) {

var start = 0,

length = array.length,

finish = length,

pivot,

comparison;

while ((length = finish - start) > 0) {

pivot = start + Math.floor(length / 2);

comparison = cmpFunc(array[pivot], value);

if (0 > comparison) {

start = pivot + 1;

} else if (0 === comparison) {

break;

} else {

finish = pivot;

}

return [pivot, comparison];

}

return nearest

}

function hashWords(words, hash) {

for (var i=0; i<words.length; i++) {

hash.update(words[i]);

}

fastCount = {

let init = corpus[0].slice(0,5000)

let span = init.length

let dict = {}

let t1= performance.now()

let r;

for (let i = 0; i < span; ++i) {

let o = {}

let w = init[i].split(' ')

//dict[i] = o;

for (let j = 0;j < w.length;++j){

o[w[j]] = 1

//let s = w[j]

// dict[s]?

// dict[s]++:dict[s] = 1

}

//dict[i] = o

}

return {

dict,

time: performance.now()-t1

}

Purpose-built for displays of data