Codetations backend: methods for plain text location anchoring / Edward Misback

Edward Misback

Live programming @ University of Washington

Workspace

Public

Edited

Dec 8, 2022

{

viewof annT2.value = `

<1>const newT2rev = <2>newT2.split('').reverse()</2>.join('')</1>

<3>const isOpen = t => t.tag[0][1] !== '/'</3>;

const pairMapping = newTags.filter(isOpen)

.map(<4>t1 => ({open: t1,

close: newTags

.find(<6>(t2, j) => <7>t2.tag[0].slice(2,-1)</7> === t1.tag[0].slice(1,-1)</6>)})</4> )// ...

viewof newT2.value = oldT2

mutable mutT2 = oldT2

mutable mutAnnT2 = annT2

}

diff = require('diff')

// Differ cell: builds a `diff.Diff` instance (constructed with `true`), replaces its
// `tokenize` method with the function below, and exposes the instance as `d`.
// NOTE(review): in the text visible here `tokens` is never declared or assigned, so
// `return tokens` would fail as written; the statement that produced it appears to be
// missing from this copy of the cell — confirm against the original notebook.
d = {

const out = new diff.Diff(true)

// Custom tokenizer installed on the differ; receives the string to split as `value`.
out.tokenize = function(value) {

// tokenize usage described at https://github.com/kpdecker/jsdiff/issues/63

// word tokenize example at https://github.com/kpdecker/jsdiff/blob/3b654c2ed7d5262ed9946de841ad8dae990286c7/src/diff/word.js#L34

// Matches a run made up only of ASCII letters and the listed Latin ranges.
// It is referenced only from the commented-out merge loop below.
const extendedWordChars = /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;

// All whitespace symbols except newline group into one token, each newline - in separate token

// NOTE this can be tuned

//Join the boundary splits that we do not consider to be boundaries. This is primarily the extended Latin character set.

// for (let i = 0; i < tokens.length - 1; i++) {

// // If we have an empty string in the next field and we have only word chars before and after, merge

// if (!tokens[i + 1] && tokens[i + 2]

// && extendedWordChars.test(tokens[i])

// && extendedWordChars.test(tokens[i + 2])) {

// tokens[i] += tokens[i + 2];

// tokens.splice(i + 1, 2);

// i--;

// }

return tokens;

};

// The cell's value is the configured differ.
return out

}

lineDiff = new diff.Diff(true)

newT2

diff.diffLines(oldT2, newT2, {newlineIsToken: true})

// Returns a copy of every change object in `ds` with an extra `newline` field.
// The field is truthy only for an added change that starts on a line boundary
// (no previous change, or the previous value ends in "\n") and also finishes on
// one (its own value ends in "\n", or there is no next change, or the next value
// starts with "\n"). Non-added changes simply carry over their falsy `added`.
addIsNewline = ds =>
  Array.from({length: ds.length}, (_, pos) => {
    const change = ds[pos];
    const before = ds[pos - 1];
    const after = ds[pos + 1];
    // Thunks keep the original short-circuit order: `change.value` is only
    // touched once the earlier conditions have already held.
    const startsOnBoundary = () => !before || before?.value?.endsWith('\n');
    const endsOnBoundary = () =>
      change.value.endsWith('\n') || !after || after?.value?.startsWith('\n');
    const newline = change.added && startsOnBoundary() && endsOnBoundary();
    return {...change, newline};
  })

// addIsNewlineP = ps => {

// const result = []

// for (let i = 0; i < ps.length; i++) {

// const newline = ps[i].added && (!ps[i-1] || ps[i-1]?.[0]?.endsWith('\n')) && (ps[i][0].endsWith('\n') || (!ps[i+1] || ps[i+1]?.[0].startsWith('\n')))

// result.push({...ps[i], newline})

// }

// return result

// }

''.endsWith

diffs2Patches(addIsNewline(d.diff(oldT2, newT2)))

d.diff(oldT2, newT2)

d.diff(oldT2, annT2)

diffs2Patches(d.diff(oldT, newT1))

// Converts a list of diff change objects ({value, added?, removed?, newline?})
// into patches addressed against the ORIGINAL text.
//
// Each returned patch has:
//   0       - the inserted or removed text
//   index   - for additions, the offset in the original text where the text is
//             inserted; for removals, the offset just past the removed range
//   added / removed / newline - copied from the change
//   drop    - always false on returned patches (unchanged spans are filtered out)
//
// When a removal is immediately followed by an addition, the addition is moved
// in front so that it is indexed at the start of the removed range. The previous
// implementation tried to do this inside a sort comparator but tested `d.added`
// (a name from the enclosing scope) instead of `a.added`, so the swap never
// depended on the elements being compared.
diffs2Patches = ds => {
  const ordered = [...ds];
  for (let i = 0; i + 1 < ordered.length; i++) {
    if (ordered[i].removed && ordered[i + 1].added) {
      [ordered[i], ordered[i + 1]] = [ordered[i + 1], ordered[i]];
      i++; // the pair is settled; do not reconsider its second element
    }
  }

  const result = [];
  let offset = 0; // characters of the original text consumed so far
  for (const change of ordered) {
    // Added text does not exist in the original, so it consumes nothing.
    const consumed = change.added ? 0 : change.value.length;
    if (change.added || change.removed) {
      result.push({
        0: change.value,
        newline: change.newline,
        // removals are indexed by the range's final index
        index: change.removed ? offset + consumed : offset,
        added: change.added,
        removed: change.removed,
        drop: false,
      });
    }
    offset += consumed;
  }
  return result;
}

"abcdefGhi.Lmn".split(/(.)/)

patches(oldT, newT1)

diffs2Patches(d.diff(oldT, newT))

patches(oldT, newT1)

merge(oldT, patches(oldT, annT), patches(oldT, newT))

merge(oldT, patches2(oldAnn), patches2(oldNew))

mutable mutT2 = oldT2

mutable mutAnnT2 = annT2

//merge(mutable mutT2, patches(mutable mutT2, mutable mutAnnT2), patches(mutable mutT2, newT2))

{

mutable mutAnnT2 = merge2(mutable mutT2, mutable mutAnnT2, newT2)

mutable mutT2 = newT2

}

patches(oldT2, annT2)

patches(oldT2, newT2)

oldT.length

// Builds the patch list that turns `old` into `current`: diff the two strings
// with `d`, flag whole-line additions, then convert the changes to patches.
patches = (old, current) => {
  const changes = d.diff(old, current);
  const flagged = addIsNewline(changes);
  return diffs2Patches(flagged);
}

// Applies one patch `d` to the string `s` and returns the new string.
//   d[0]    - the text to insert or remove
//   d.index - insertion offset for additions; for any other patch, the offset
//             just past the text to remove
//   d.added - truthy for an insertion, otherwise the patch is treated as a removal
// The closing brace of this function was missing, which left the definition
// unterminated and running into the following cell.
applyDelta = (s, d) => {
  const text = d[0];
  if (d.added) {
    return s.slice(0, d.index) + text + s.slice(d.index);
  }
  // Removal: cut the `text.length` characters that end at d.index.
  return s.slice(0, d.index - text.length) + s.slice(d.index);
}

// Timing cell: runs merge2(oldT2, annT2, newT2) once and yields
// [elapsed time per performance.now(), merge result].
{

// Timestamp taken immediately before the merge.
const t0 = performance.now()

const out = merge2(oldT2, annT2, newT2)

// Timestamp taken immediately after the merge.
const t1 = performance.now()

return [t1 - t0, out]

}

// Moves annotation tags around in the string according to two rules:
//  * a closing tag (</n>) should never be preceded by whitespace or an opening
//    paren/bracket/brace, so it is shifted left past any such characters;
//  * an opening tag (<n>) should never be followed by whitespace or a closing
//    paren/bracket/brace, so it is shifted right past any such characters.
// Returns the rewritten string; `s` itself is not modified.
//
// Fixes relative to the previous version: both `for` loops were missing their
// closing braces; a tag that reached the very start or end of the string made
// the loop index past the string and call `.match` on undefined; and per-move
// debug logging has been dropped.
clean = s => {
  const notBeforeClosing = /[\r\n\s\(\[\{]/;
  const notAfterOpening = /[\r\n\s\)\]\}]/;
  let text = s;

  // Closing tags are located once, up front. Shifting a tag left only permutes
  // characters in front of it, so the recorded positions of later tags stay valid.
  for (const tag of s.matchAll(/<\/\d+>/g)) {
    let idx = tag.index;
    while (idx > 0 && notBeforeClosing.test(text[idx - 1])) {
      text = text.slice(0, idx - 1) + tag[0] + text[idx - 1] + text.slice(idx + tag[0].length);
      idx -= 1;
    }
  }

  // Opening tags are handled right-to-left for the mirrored reason: shifting a
  // tag right never disturbs the tags that start before it.
  const openingTags = [...text.matchAll(/<\d+>/g)].sort((a, b) => b.index - a.index);
  for (const tag of openingTags) {
    let idx = tag.index;
    while (idx + tag[0].length < text.length && notAfterOpening.test(text[idx + tag[0].length])) {
      text = text.slice(0, idx) + text[idx + tag[0].length] + tag[0] + text.slice(idx + tag[0].length + 1);
      idx += 1;
    }
  }

  return text;
}

merge2(oldT2, annT2, newT2)

s = merge1(oldT2, annT2, newT2)

{

function swap(s, idx1, idx2) {

const copy = s.split('')

copy[idx1] = s[idx2]

copy[idx2] = s[idx1]

return copy.join('')

}

let s1 = s

for (const tag of s1.matchAll(/<\/\d+>/g)) {

let idx = tag.index

while (s1[idx-1].match(/[\r\n\s\(\[\{]/)) {

s1 = s1.slice(0, idx-1) + tag[0] + s1[idx-1] + s1.slice(idx + tag[0].length)

idx = idx-1

}

return s1

}

// Three-way merge of `ann` and `newT` against the common base `s`, followed by
// the tag-placement pass in `clean`.
merge2 = (s, ann, newT) => {
  const merged = merge1(s, ann, newT);
  return clean(merged);
}

// Derives the patch lists base->s1 and base->s2 (in that order) and hands both,
// together with the base string `s`, to `merge`.
merge1 = (s, s1, s2) => {
  const firstEdits = patches(s, s1);
  const secondEdits = patches(s, s2);
  return merge(s, firstEdits, secondEdits);
}

// Applies two patch lists to the base string `s` and returns the resulting string.
//   s   - base text both patch lists are indexed against
//   ds1 - first patch list; its patches are tagged `left: true`
//   ds2 - second patch list; its patches are tagged `right: true`
// Steps: tag and sort all patches with `order`, pull patches that fall inside a
// removed range back to the start of that range, sort again, then apply the
// patches one by one with applyDelta. The console.log calls are debug output.
merge = (s, ds1, ds2) => {

// Go from back to front, applying removals first when there are conflicts

console.log('start merge', s, ds1, ds2)

// Comparator over patches: larger `index` first; ties are broken below.
const order = (a, b) => {

// Local `diff` shadows the notebook-level `diff` binding inside this comparator.
const diff = a.index - b.index

if (diff) {return -diff } // reverse overall order

// Two additions at the same index: a patch flagged `newline` sorts after the
// other; otherwise a `left` patch sorts after a non-left one unless its text has
// '/' as second character (presumably a closing tag — verify).
if (a.added && b.added) {

console.log(a, b, a.left ? b.left ? 0 : (a[0][1] === '/' ? 0 : 1) : 0)

return a.newline ? 1 : b.newline ? -1 // make sure added lines end up applied last

: a.left ? b.left ? 0 : (a[0][1] === '/' ? 0 : 1) : 0}

// NOTE(review): this returns -1 whichever way round the two patches are passed,
// so the comparator is not antisymmetric for such ties (see the TODO).
if (a.removed === b.removed) {return -1} // TODO preserve order within removals/additions

if (a.removed) {return -1} // Apply removals first

return 1 // Flip if order is a:removed, b:added

}

// Copy every patch with a marker for the list it came from, then sort.
const sorted = [...ds1.map(d => ({...d, left: true})), ...ds2.map(d => ({...d, right: true}))].sort(order)

console.log(sorted)

// Move annotations that occur in a removed section back to the start of the section.

// The accumulator is [patches so far, start index of the most recent removal];
// the start index is cleared once a patch sitting exactly on it is reached.
const reindexed = sorted.reduce(([acc, removeStartIdx], d) => {

let index = d.index;

if (removeStartIdx === d.index) {

removeStartIdx = undefined

}

if (d.removed) {

console.log('HERE', d)

// A removal's index is the end of its range, so the start is index minus length.
removeStartIdx = d.index - d[0].length

}

else if (removeStartIdx !== undefined && removeStartIdx < d.index) {

index = removeStartIdx

}

return [[...acc, {...d, index}], removeStartIdx]

}, [[], undefined])[0].sort(order)

console.log(reindexed)

// Apply the patches in sorted order, threading the string through applyDelta.
return reindexed.reduce((acc, d) => {

const out = applyDelta(acc, d)

console.log(d[0], d.index, out)

return out

}, s)

}

distance = require('jaro-winkler')

oldT2

newT2

annT2

merge2(oldT2, annT2, newT2)

merge3(annT2, newT2)

merge4(annT2, newT2)[0]

//clean(out[0])

out = {

const t0 = performance.now()

const out = doubleSided && merge5(annT2, newT2)

return out

}

// Re-anchors the <n>/</n> tags found in `annT2` onto `newT2` by similarity scoring.
//   annT2 - annotated text containing tags of the form <digits> / </digits>
//   newT2 - new, tag-free text to place the tags into
// Returns [newT2 with every tag inserted at its best-scoring position, fullmapping],
// where fullmapping holds [tagText, [bestIndex, bestScore]] per tag.
// When newT2 is longer than 500 characters it returns the one-element array
// ['aborting to avoid performance issue'] instead.
// `distance` is the notebook-level jaro-winkler import.
merge5 = (annT2, newT2) => {

if (newT2.length > 500) return ['aborting to avoid performance issue']

// try "looking left and right", since this algo ostensibly likes matching prefixes

// also assess range fit

const tagRE = /<\/?\d+>/g

const tags = [...annT2.matchAll(tagRE)]

// Copies of the matches whose `index` has the combined length of all earlier tags
// subtracted, i.e. the position of each tag in the tag-free text.
const reindexed = tags.reduce(([acc, idx], t) => [[...acc, {...t, index: t.index - idx}], idx + t[0].length], [[], 0])[0]

let newTags = []

// Width of the left/right context windows compared below.
let N = 20

const M = 3 // context for range matching

// NOTE(review): `new Array(n)` holds only holes, `map` leaves holes untouched and
// `join` renders each as '', so this produces n - 1 spaces (none for n <= 1).
let nSpaces = n => new Array(n).map(v => '').join(' ')

// The k characters of `s` starting at i, with `s` padded on the right by nSpaces(k).
const context = (s, i, k) => (s + nSpaces(k)).slice(i, i + k)

const pad2sides = (s, k) => (nSpaces(k) + s + nSpaces(k))

// Tag-free version of annT2 (a local, distinct from any notebook-level `oldT2`).
const oldT2 = annT2.replaceAll(tagRE, '')

const oldT2rev = oldT2.split('').reverse().join('')

// For each tag record the text to its right (contextR) and, by slicing the reversed
// text from oldT2.length - idx, the text to its left in reverse order (contextL).
for (const tag of reindexed) {

let idx = tag.index

newTags.push({contextR: context(oldT2, idx, N), contextL: context(oldT2rev, oldT2.length - idx, N), tag})

// while (s1[idx-1].match(/[\r\n\s\(\[\{]/)) {

// s1 = s1.slice(0, idx-1) + tag[0] + s1[idx-1] + s1.slice(idx + tag[0].length)

// idx = idx-1

// }

}

newTags = newTags.map((t, i) => ({...t, idx: i}))

//return newTags

//return distance("mn(opQrs)\n ", context(newT2, 5, N))

const newT2rev = newT2.split('').reverse().join('')

// A tag counts as opening when the second character of its text is not '/'.
const isOpen = t => t.tag[0][1] !== '/';

// One entry per opening tag: it is paired with the first tag whose text minus
// "</" and ">" equals the opening tag's text minus "<" and ">". The entry is a
// newT2.length x newT2.length table of [[i, j], score], where score is -1 when
// j < i and otherwise the distance between the padded old text sliced at the
// pair's indices and the padded new text sliced at (i, j).
const pairMapping = newTags.filter(isOpen).map(t1 => ({open: t1, close: newTags.find((t2, j) => t2.tag[0].slice(2,-1) === t1.tag[0].slice(1,-1))}) ).map(({open, close}) => [...new Array(newT2.length)].map((_,i) => [...new Array(newT2.length)].map((_,j) => [[i,j], j < i ? -1 : distance(pad2sides(oldT2, M).slice(open.tag.index, close.tag.index), pad2sides(newT2, M).slice(i, j))])))

// Per tag: [tagText, score for every candidate position i in newT2].
const fullmapping1 = newTags.map((t) => {

return [t.tag[0],

[...new Array(newT2.length)].map((_,i) => {

// whoops, we need to recover tagIdx

// Index of this tag's pair table, found by comparing the single character before
// the tag's final '>' with each opening tag's id.
// NOTE(review): presumably this only matches single-character ids — confirm.
const pairIdx = newTags.filter(isOpen).findIndex(t1 => t1.tag[0].slice(1,-1) === t.tag[0].slice(-2,-1))

// Weighted sum: .3 right context, .3 left context, .2 each for the first three
// characters on either side, plus .4 times the best pair-table score with this
// tag fixed at i (row i for an opening tag, column i for a closing tag).
return (distance(t.contextR, context(newT2, i, N)) * .3 +

distance(t.contextL, context(newT2rev, newT2.length - i, N)) * .3 +

distance(t.contextR.slice(0, 3), context(newT2, i, 3)) * .2 +

distance(t.contextL.slice(0, 3), context(newT2rev, newT2.length - i, 3)) * .2 +

/*(isOpen(t) ? pairMapping[pairIdx][i].reduce((sum, v, k) => sum + v[1], 0) :

pairMapping[pairIdx].reduce((sum, v, k) => sum + v[i][1], 0)) *(1/25) * .4 + */

(isOpen(t) ? pairMapping[pairIdx][i].reduce(([j, max], v, k) => [v[1] > max ? k : j, v[1] > max ? v[1]: max], [-1, -1])[1] :

pairMapping[pairIdx].reduce(([j, max], v, k) => [v[i][1] > max ? k : j, v[i][1] > max ? v[i][1]: max], [-1, -1])[1]) * .4)

})]

})

//return fullmapping1

// Arg-max per tag: [tagText, [index of highest score, highest score]], seeded with [-1, null].
const fullmapping = fullmapping1.map(([t, out]) => [t, out.reduce(([i, max], v, j) => [v > max ? j : i, v > max ? v : max], [-1, null])])

//return fullmapping

const mapping = fullmapping.map(m => [m[0], m[1][0]])

//return mapping

// Insert tags from the highest index down so earlier insertions do not shift later ones.
// NOTE(review): `a[0][-2]` is a bracket lookup of the property "-2" on a string and
// is always undefined, so the tie-break comparison is always false and yields 1.
const out = mapping.sort((a, b) => b[1] - a[1] !== 0 ? b[1] - a[1] : a[0][-2] < b[0][-2] ? -1 : 1).reduce((acc, [t, i]) => acc.slice(0, i) + t + acc.slice(i), newT2)

return [out, fullmapping]

}

// Re-anchors the <n>/</n> tags found in `annT2` onto `newT2` using left and right
// context similarity only.
//   annT2 - annotated text containing tags of the form <digits> / </digits>
//   newT2 - new, tag-free text to place the tags into
// Returns [newT2 with every tag inserted at its best-scoring position, fullmapping],
// where fullmapping holds [tagText, [bestIndex, bestScore]] per tag.
// When newT2 is longer than 100 characters it returns the one-element array
// ['aborting to avoid performance issue'] instead.
// `distance` is the notebook-level jaro-winkler import.
merge4 = (annT2, newT2) => { // try "looking left and right", since this algo likes matching prefixes

if (newT2.length > 100) return ['aborting to avoid performance issue']

const tagRE = /<\/?\d+>/g

const tags = [...annT2.matchAll(tagRE)]

// Copies of the matches whose `index` has the combined length of all earlier tags
// subtracted, i.e. the position of each tag in the tag-free text.
const reindexed = tags.reduce(([acc, idx], t) => [[...acc, {...t, index: t.index - idx}], idx + t[0].length], [[], 0])[0]

const newTags = []

// Width of the left/right context windows compared below.
let N = 30

// NOTE(review): `new Array(n)` holds only holes, `map` leaves holes untouched and
// `join` renders each as '', so this produces n - 1 spaces (none for n <= 1).
let nSpaces = n => new Array(n).map(v => '').join(' ')

// The k characters of `s` starting at i (right-padded with nSpaces(k)), trimmed.
const context = (s, i, k) => (s + nSpaces(k)).slice(i, i + k).trim()

// Tag-free version of annT2 (a local, distinct from any notebook-level `oldT2`).
const oldT2 = annT2.replaceAll(tagRE, '')

const oldT2rev = oldT2.split('').reverse().join('')

// For each tag record the text to its right (contextR) and, by slicing the reversed
// text from oldT2.length - idx, the text to its left in reverse order (contextL).
for (const tag of reindexed) {

let idx = tag.index

newTags.push({contextR: context(oldT2, idx, N), contextL: context(oldT2rev, oldT2.length - idx, N), tag})

// while (s1[idx-1].match(/[\r\n\s\(\[\{]/)) {

// s1 = s1.slice(0, idx-1) + tag[0] + s1[idx-1] + s1.slice(idx + tag[0].length)

// idx = idx-1

// }

}

//return newTags

//return distance("mn(opQrs)\n ", context(newT2, 5, N))

const newT2rev = newT2.split('').reverse().join('')

// Per tag: score every position i of newT2 as right-context distance plus
// left-context distance, then keep [index of highest score, highest score]
// (seeded with [-1, null]).
const fullmapping = newTags.map(t => [t.tag[0], [...new Array(newT2.length)].map((_,i) => distance(t.contextR, context(newT2, i, N)) + distance(t.contextL, context(newT2rev, newT2.length - i, N))).reduce(([i, max], v, j) => [v > max ? j : i, v > max ? v : max], [-1, null])])

const mapping = fullmapping.map(m => [m[0], m[1][0]])

//return mapping

// Insert tags from the highest index down so earlier insertions do not shift later ones.
// NOTE(review): `a[0][-2]` is a bracket lookup of the property "-2" on a string and
// is always undefined, so the tie-break comparison is always false and yields 1.
const out = mapping.sort((a, b) => b[1] - a[1] !== 0 ? b[1] - a[1] : a[0][-2] < b[0][-2] ? -1 : 1).reduce((acc, [t, i]) => acc.slice(0, i) + t + acc.slice(i), newT2)

return [out, fullmapping]

}

// Re-anchors the <n>/</n> tags found in `annT2` onto `newT2` using a single context
// window around each tag.
//   annT2 - annotated text containing tags of the form <digits> / </digits>
//   newT2 - new, tag-free text to place the tags into
// Returns newT2 with every tag inserted at its best-scoring position (a string).
// When newT2 is longer than 1000 characters it returns the one-element array
// ['aborting to avoid performance issue'] instead.
// `distance` is the notebook-level jaro-winkler import.
merge3 = (annT2, newT2) => {

if (newT2.length > 1000) return ['aborting to avoid performance issue']

const tagRE = /<\/?\d+>/g

const tags = [...annT2.matchAll(tagRE)]

// Copies of the matches whose `index` has the combined length of all earlier tags
// subtracted, i.e. the position of each tag in the tag-free text.
const reindexed = tags.reduce(([acc, idx], t) => [[...acc, {...t, index: t.index - idx}], idx + t[0].length], [[], 0])[0]

const newTags = []

// Half-width parameter of the context window built below.
let N = 20

// NOTE(review): `new Array(n)` holds only holes, `map` leaves holes untouched and
// `join` renders each as '', so this produces n - 1 spaces (none for n <= 1).
let nSpaces = n => new Array(n).map(v => '').join(' ')

// Pads `s` on both sides with nSpaces(k), takes the 2k characters starting at
// offset i of the padded string, and trims the result.
const context = (s, i, k) => (nSpaces(k) + s + nSpaces(k)).slice(i, i + k*2).trim()

// Record the context of each tag in the tag-free text; the tag-free text is
// recomputed with replaceAll on every iteration.
for (const tag of reindexed) {

let idx = tag.index

newTags.push({context: context(annT2.replaceAll(tagRE, ''), idx, N), tag})

// while (s1[idx-1].match(/[\r\n\s\(\[\{]/)) {

// s1 = s1.slice(0, idx-1) + tag[0] + s1[idx-1] + s1.slice(idx + tag[0].length)

// idx = idx-1

// }

}

//return newTags

//return distance("mn(opQrs)\n ", context(newT2, 5, N))

// Per tag: [tagText, index in newT2 whose context scores highest]; the arg-max is
// seeded with [-1, null].
const mapping = newTags.map(t => [t.tag[0], [...new Array(newT2.length)].map((_,i) => distance(t.context, context(newT2, i, N))).reduce(([i, max], v, j) => [v > max ? j : i, v > max ? v : max], [-1, null])[0]])

// Insert tags from the highest index down so earlier insertions do not shift later ones.
// NOTE(review): `a[0][-2]` is a bracket lookup of the property "-2" on a string and
// is always undefined, so the tie-break comparison is always false and yields 1.
const out = mapping.sort((a, b) => b[1] - a[1] !== 0 ? b[1] - a[1] : a[0][-2] < b[0][-2] ? -1 : 1).reduce((acc, [t, i]) => acc.slice(0, i) + t + acc.slice(i), newT2)

return out

}

oldNew = 'abcdefGhi.Lmn([-opQrs-]{+A + B+})'

patches2(oldAnn)

oldAnn = '{+<3><1>+}abcdefGhi{+</1>+}.Lmn({+<2>+}opQrs{+</2>+}){+</3>+}'

patches2(oldNew)

m = import('https://unpkg.com/three-way-merge@0.1.0/dist/modules/src/index.js?module')

patches2(oldAnn)

applyDelta(oldT, patches2(oldAnn)[0])

applyDelta(oldT, patches2(oldNew)[0])

patches2(oldNew)

patches2(oldAnn)[0][0]

// Extracts inline edit markers of the form {+text+} / [-text-] from `s`.
// Every marker becomes a copy of its regex match with:
//   0        - the marker text without its two-character delimiters
//   added    - true when the marker's second character is '+'
//   removed  - the opposite of `added`
//   newIndex - position of the marker inside `s`
//   index    - that position minus the total length of all earlier markers
patches2 = s => {
  const markers = [...s.matchAll(/[{\[][+-].*?[\]}]/g)];
  const found = [];
  let markerChars = 0; // combined length of the markers already processed
  for (const marker of markers) {
    const raw = marker[0];
    const isAddition = raw[1] === '+';
    // index should be the index of the character the edit precedes
    found.push({
      ...marker,
      0: raw.slice(2, -2),
      added: isAddition,
      removed: !isAddition,
      newIndex: marker.index,
      index: marker.index - markerChars,
    });
    markerChars += raw.length;
  }
  return found;
}

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.

Learn more