Public
Edited
Jul 19, 2024
Paused
2 forks
13 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
viewof drSelection = {
clusters;
maxClusters;
return BrushableScatterPlot(dataToPlot, {
color: colorBy,
size: "score",
interactive,
colorScheme,
tooltip: [
"title",
"award",
"score",
"track",
"firstSessionName",
"authorNames",
"abstract"
]
});
}
Insert cell
// Bare reference: shows the current value of `dataToPlot`.
dataToPlot
Insert cell
Insert cell
Insert cell
Insert cell
// The `clicked` property of the scatterplot view's value.
paperClicked = drSelection.clicked
Insert cell
// The `brushed` property of the scatterplot view's value.
papersHighlighted = drSelection.brushed
Insert cell
sessionsToAttend = {
query;
semanticSearchResults;
return d3
.groups(
navioSelected.filter((d) => d.score > minScore/100),
// (v) => v.length,
(d) => d.firstTimestamp.toLocaleDateString(...dateOptions),
(d) =>
`${d.firstTimestamp.toLocaleTimeString(...dateOptions)} ${d.firstRoomName} - ${d.firstSessionName}`
)
.sort((a, b) => d3.ascending(a[0], b[0]))
.map(([g, v]) => [g, v.sort((a, b) => d3.ascending(a[0], b[0]))]);
}
Insert cell
// [locale, options] pair that other cells spread into
// toLocaleDateString / toLocaleTimeString.
dateOptions = ["en-US", { timeZone: "utc" }]
Insert cell
roomsToGo = {
query;
semanticSearchResults;
return d3
.groups(
navioSelected.filter((d) => d.score > minScore / 100),
// (v) => v.length,
(d) => d.firstTimestamp.toLocaleDateString(...dateOptions),
(d) => `${d.firstRoomName}`
// (d) => d.title
)
.sort((a, b) => d3.ascending(a[0], b[0]))
.map(([g, v]) => [g, v.sort((a, b) => b[1].length - a[1].length)]);
}
Insert cell
Insert cell
// Narrow number input (min 1, initial value 10) bound to the `maxPapers` view
// through Inputs.bind.
viewof maxPapersSpinner = Inputs.bind(htl.html`<input type="number" style="width: 3rem" min=1 value=10>`, viewof maxPapers)
Insert cell
Insert cell
Insert cell
Insert cell
// navio(embeddings.map(d => d.data), {attribWidth: 2, height: 600})
Insert cell
Insert cell
Insert cell
Insert cell
// When `computeEmbeddings` is truthy, builds a download control labelled
// "Download Embeddings" that saves `embeddings_computed` as embeddings.json;
// otherwise the cell evaluates to the falsy `computeEmbeddings` value.
computeEmbeddings && DOM.download(new Blob(
[JSON.stringify(embeddings_computed)],
{type: "application/json"}
), "embeddings.json", "Download Embeddings")
Insert cell
Insert cell
// Generator cell that computes one embedding per row of `sample` while showing
// a progress view; when `computeEmbeddings` is off it yields a placeholder view
// whose value is an empty array.
viewof embeddings_computed = {
if (computeEmbeddings) {
const before = performance.now();
// Bare reference: makes this cell re-run whenever `restart` changes.
restart;
const values = [];
const bar = progress({ interval: 50, invalidation });
// Show the progress view before the (slow) extraction loop starts.
yield bar;
let i =0;
// Rows are embedded one at a time; each extractor call is awaited in turn.
for (let row of sample) {
values.push({id: row.id,
embedding: await extractor(embeddingAccessor(row), extractorOptions)
});
await bar.progress(i++, sample.length);
// console.log("i", i);
}
console.log("progress completed in", performance.now() - before);
// Hand the collected {id, embedding} records to the progress view.
bar.resolve(values);
} else {
// Not computing embeddings: show the placeholder instead.
yield pleaseEnableComputeEmbeddings();
}
}
Insert cell
// Freshly computed embeddings when `computeEmbeddings` is on, otherwise the
// cached ones loaded from the attached file.
embeddings = computeEmbeddings ? embeddings_computed : embeddings_cached
Insert cell
embeddings_cached = (
embeddingsFileSelected === "Title and Abstract May 13" ?
(await FileAttachment("chi2024_embeddingsTitleAbstract_May13.json.zip").zip()) .file("chi2024_embeddingsTitleAbstract_May13.json")
:
(await FileAttachment("chi2024papers-all-componentized_embeddings.json.zip").zip()).file("chi2024papers-all-componentized_embeddings.json")
)
.json()
.then((res) => {
for (let row of res) {
row.embedding.data = new Float32Array(
Array.from(Object.values(row.embedding.data))
);
// newRes.push({
// data: new Float32Array(Array.from(Object.values(row.data)))
// });
}
return res;
})
Insert cell
viewof embeddingsFileSelected = Inputs.select(
[
"Title and Abstract May 13",
"Joel's componentized_embeddings",
],
{ label: "Procomputed embeddings file"}
)
Insert cell
// Opens the componentized-embeddings zip attachment.
// NOTE(review): no cell visible here reads this name — confirm it is still needed.
chi2024papersAllComponentized_embeddingsJson = FileAttachment("chi2024papers-all-componentized_embeddings.json.zip").zip()
Insert cell
// Builds the text to embed for a row: the fields named in
// `computeEmbeddingsUsing`, separated by blank lines.
embeddingAccessor = (d) => {
  const parts = computeEmbeddingsUsing.map((field) => d[field]);
  return parts.join("\n\n");
}
Insert cell
embeddingsHash = {
console.log("computing embHash");
return new Map(embeddings.map((d, i) => [d.id, d.embedding]))
}
Insert cell
Insert cell
/**
 * Renders one paper as a small card: linked title, similarity score (as a
 * percentage with two decimals) plus awards, author names, track and first
 * session, and the abstract.
 *
 * @param {object} p - Paper row; reads url, title, score, awards,
 *   authorsExpanded, track, firstSessionName and abstract.
 * @returns The element produced by `htl.html`.
 */
function renderItem(p) {
  const similarity = (p.score * 100).toFixed(2);
  const awards = p.awards || "";
  const authorLine = p.authorsExpanded
    .map(({ firstName, lastName }) => `${firstName} ${lastName}`)
    .join(", ");

  // The markup stays flush-left so the template's literal text is unchanged.
  return htl.html`
<div style="width: 150px; flex: 1; padding-right: 10px; padding-bottom: 15px">
<strong><a href=${p.url}>${p.title}</a></strong>
<div>Similarity score: ${similarity}% ${awards}</div>
<div style="font-style: italic; max-height: 4em; overflow: auto;">${authorLine}</div>
<div>${p.track} - ${p.firstSessionName}</div>
<div style="margin-top: 0.5em; max-height: 70px; overflow: auto">${p.abstract}</div>
</div>
`;
}
Insert cell
facetedSelected = facetedSelectedPhase1.filter(
(d) =>
tracksSelected.includes(d.track) &&
(!authorsSelected.length ||
authorsSelected.some((a) => d.authorsList.includes(a))) &&
(!affiliationsSelected.length ||
affiliationsSelected.some((a) => d.affiliationsList.includes(a.toLocaleLowerCase())))
)
Insert cell
/**
 * Builds the placeholder shown while embedding computation is switched off.
 *
 * @returns The `htl.html` message node with `value` set to an empty array.
 */
function pleaseEnableComputeEmbeddings() {
  const placeholder = htl.html`Please enable compute embeddings`;
  placeholder.value = [];
  return placeholder;
}
Insert cell
// Alias: the rows used for embedding, scoring and projection are `navioSelected`.
sample = navioSelected
Insert cell
// Attribute names of the first paper, plus the "score" attribute.
attrs = [...Object.keys(papers[0]), "score"]
Insert cell
Insert cell
reducerResultComputed = {
embeddings;
console.log(
"Launching DR Worker",
[...embeddingsHash.keys()].length,
sample.length,
sample.at(-1),
tracksSelected
);
if (useCachedDR) return [];
restartDR;

return DruidGenerator(
sample.map((d) => embeddingsHash.get(d.id).data),
druid_method,
druid_params
);

}
Insert cell
reducerResult = {
console.log("got reducerResult", sample.length)
return useCachedDR
? reducerResultCached
: Object.fromEntries(
sample.map((d, id) => [d.id, reducerResultComputed[id]])
)
}
Insert cell
// Export helper: lists {id, x, y} for every row of `sample`, pairing rows with
// `reducerResultComputed` by position.
// NOTE(review): `reducerResultComputed` is [] when `useCachedDR` is on, in which
// case the destructuring below would throw — confirm this is only run uncached.
sample.map((d, i) => {
const [x, y] = reducerResultComputed[i];
return ({id: d.id, x, y });
})
Insert cell
reducerResultCached = FileAttachment("chi2024_umapTitleAbstract_may13.csv")
.csv({ typed: true })
.then((res) => Object.fromEntries(res.map((d) => [d.id, [d.x, d.y]])))
Insert cell
dataToPlot = {
// Recompute data to plot when clustering;
computeClusters;
maxClusters;

return (semanticSearchResults ? semanticSearchResults : navioSelected).map(
(row, id) => ({
...row,
x: reducerResult[row.id] && reducerResult[row.id][0],
y: reducerResult[row.id] && reducerResult[row.id][1]
})
);
}
Insert cell
Insert cell
// Scores every row of `sample` against `query`; the result is undefined when the
// query is empty (see cosineSimilarity).
semanticSearchResults = cosineSimilarity(query, sample)

Insert cell
// Adapted from
// https://github.com/xenova/transformers.js/blob/main/examples/semantic-image-search-client/src/app/worker.js#L46
/**
 * Scores each row of `data` against `query` and stores the result in
 * `row.score` (the rows are mutated in place).
 *
 * The query is embedded with `extractor`; each row's vector is looked up by
 * `row.id` in `embeddingsHash`.
 *
 * @param {string} query - Search text. When falsy, every score is reset to 0.
 * @param {Array<object>} data - Rows with an `id`; each receives a `score`.
 * @returns {Promise<Array<object>|undefined>} `data` once scored, or undefined
 *   when `query` is falsy (callers use that to fall back to unscored rows).
 */
async function cosineSimilarity(query, data) {
  if (!query) {
    for (const row of data) {
      row.score = 0;
    }
    return;
  }

  const queryVector = (await extractor(query, extractorOptions)).data;

  for (const row of data) {
    const dbVector = embeddingsHash.get(row.id)?.data;
    if (dbVector) {
      row.score = pairCosineSimilarity(queryVector, dbVector);
    } else {
      console.log("no embedding found for", row.id);
      // Without this reset the row would keep the score of an earlier query.
      row.score = 0;
    }
  }

  return data;
}
Insert cell
/**
 * Computes the cosine similarity between two embeddings.
 *
 * Only the first `embeddingA.length` components of each vector are read.
 *
 * @param {ArrayLike<number>} embeddingA - First vector.
 * @param {ArrayLike<number>} embeddingB - Second vector.
 * @returns {number} The cosine of the angle between the vectors, or 0 when
 *   either vector has zero magnitude (instead of NaN from dividing by zero).
 */
function pairCosineSimilarity(embeddingA, embeddingB) {
  const dimensions = embeddingA.length;
  let dotProduct = 0;
  let squaredNormA = 0;
  let squaredNormB = 0;

  for (let j = 0; j < dimensions; ++j) {
    const a = embeddingA[j];
    const b = embeddingB[j];

    dotProduct += a * b;
    squaredNormA += a * a;
    squaredNormB += b * b;
  }

  const magnitude = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB);
  // A zero vector has no direction; report "no similarity" rather than NaN.
  if (magnitude === 0) return 0;

  return dotProduct / magnitude;
}
Insert cell
import {
DruidGenerator,
viewof druid_method,
viewof druid_params
} with { default_method } from "@john-guerra/druidjs-generator"
Insert cell
// Injected into the druidjs-generator import (`with { default_method }`).
default_method = "UMAP"
Insert cell
import {conditionalShow} from "@john-guerra/conditional-show"
Insert cell
import {FacetedSearch} from "@john-guerra/faceted-search"
Insert cell
import {progress} from "@mootari/displaying-progress"
Insert cell
import {navio} from "@john-guerra/navio"
Insert cell
import {scentedCheckbox} from "@john-guerra/scented-checkbox"
Insert cell
import { BrushableScatterPlot } from "@john-guerra/brushable-scatterplot";
Insert cell
// Dynamically imports transformers.js (pinned to 2.17.1) from jsDelivr.
transformers = import("https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.1")
Insert cell
Insert cell
// Feature-extraction pipeline used to embed both papers and search queries.
// NOTE(review): `extractorOptions` is passed here as pipeline options and again
// on every call; confirm that `quantize` is a key the pipeline loader honours.
extractor = await transformers.pipeline(
"feature-extraction",
"Xenova/all-MiniLM-L6-v2",
// Alternative model that was tried: "allenai/specter2",
// { quantize: false }
extractorOptions
)
Insert cell
// Options handed to the pipeline constructor and to each extractor call.
extractorOptions = ({
pooling: "mean",
quantize: false,
// Alternatives left disabled:
// normalize: true,
// quantize: true,
// precision: "binary"
})
Insert cell
// Enriches every content entry of the program in place with its resolved
// sessions, first-session details, track name, authors, affiliations and URL.
// Properties are assigned in a fixed order because `attrs` reads the key order
// of the first paper.
papers = chiConference.contents.map((paper) => {
  const lookup = (collection, id) => maps.get(collection).get(id);

  paper.id = +paper.id;

  // Resolve sessions, attaching room and time slot to the shared session object.
  paper.sessions = paper.sessionIds.map((sessionId) => {
    const session = lookup("sessions", sessionId);
    session.room = lookup("rooms", session.roomId);
    session.timeSlot = lookup("timeSlots", session.timeSlotId);
    return session;
  });

  const firstSession = paper.sessions[0];
  paper.firstSessionName = firstSession.name;
  paper.firstRoomName = firstSession.room?.name;
  paper.firstTimestamp = new Date(firstSession.timeSlot?.startDate);
  paper.firstSessionNameWord = firstSession.name.split(" ")[0];

  paper.track = lookup("tracks", paper.trackId).name;

  paper.authorsExpanded = paper.authors.map((author) =>
    lookup("people", author.personId)
  );
  // Lower-cased "first last" names, used for filtering.
  paper.authorsList = paper.authorsExpanded.map((person) =>
    `${person.firstName?.trim()} ${person.lastName?.trim()}`.toLocaleLowerCase()
  );
  paper.authorNames = paper.authorsList.join(", ");

  // Distinct lower-cased institutions across all of the paper's authors.
  const institutions = paper.authors.flatMap((author) =>
    author.affiliations.map((f) => f.institution.trim().toLocaleLowerCase())
  );
  paper.affiliationsList = [...new Set(institutions)];

  paper.url = `https://programs.sigchi.org/chi/2024/program/content/${paper.id}`;

  return paper;
})
Insert cell
maps = {
const attrs = [
// "conference",
// "publicationInfo",
"sponsors",
"sponsorLevels",
"floors",
"rooms",
"tracks",
"contentTypes",
"timeSlots",
"sessions",
"events",
"contents",
"people",
"recognitions"
];

const maps = new Map(
attrs.map((a) => [a, new Map(chiConference[a].map((d) => [d.id, d]))])
);
return maps;
}
Insert cell
// Parsed conference program, read from the JSON file inside the zipped attachment.
chiConference = (await FileAttachment("CHI_2024_program_may13.json.zip").zip()).file("CHI_2024_program_may13.json").json()
Insert cell
Insert cell
// Dynamically imports ml-kmeans (pinned to 6.0.0) from Skypack.
kmeans = import('https://cdn.skypack.dev/ml-kmeans@6.0.0?min')
Insert cell
clusters = {
if (!computeClusters) return null;
console.log("Computing clusters");
const before = performance.now();
const clusters = kmeans.kmeans(
embeddings.map((d) => d.embedding.data),
maxClusters
);
clusters.clusters.map((c, i) => {
navioSelected[i].cluster = c;
});
console.log("🤡 clusters computed", (performance.now() - before) / 1000);
return clusters;
}
Insert cell
clusterPapers = {
if (!computeClusters) return [];
const indexes = clusters.clusters.reduce((p, d, i) => {
if (d === cluster) p.push(i);
return p;
}, []);
return indexes.map((i) => papers[i]);
}
Insert cell
viewof cluster = Inputs.range([0, maxClusters], {step: 1})
Insert cell
// Table of the papers in the selected cluster.
Inputs.table(clusterPapers)
Insert cell
// Sorted list of the distinct author names found across all papers.
authors = Array.from(new Set(papers.flatMap((d) => d.authorsList))).sort()
Insert cell
// Sorted list of the distinct institutions across all papers. Built from each
// paper's `affiliationsList` (every affiliation of every author, trimmed and
// lower-cased) so the options offered match what the affiliation filter
// compares against. Reading only `affiliations[0]` hid secondary affiliations
// and added an `undefined` entry for authors with none.
affiliations = [...new Set(papers.flatMap((d) => d.affiliationsList))].sort()
Insert cell
viewof initialAuthors = {
const widget = await PersistInput(
"authors",
Inputs.checkbox(authors, { maxHeight: 100 })
);
widget.style.height = "100px";
widget.style.overflow = "scroll";
return widget;
}
Insert cell
viewof initialAffiliations = {
const widget = PersistInput(
"affiliations",
Inputs.checkbox(affiliations, { maxHeight: "100px" })
);
widget.style.height = "100px";
widget.style.overflow = "scroll";
return widget;
}
Insert cell
import {multiAutoSelect} from "@john-guerra/multi-auto-select@262"
Insert cell
import {PersistInput} from '@john-guerra/persist-input'

Insert cell
Insert cell
// Default export of hdbscanjs (pinned to 1.0.12), loaded from Skypack.
Clustering = import('https://cdn.skypack.dev/hdbscanjs@1.0.12?min').then(res => res.default)
Insert cell
clusteredTree = {
if (!computeClusters) return {};
const dataset = Object.entries(reducerResult).map(([k, v]) => ({
data: v,
id: k
}));

// two distance measure functions are supported:
// 1) euclidean
// 2) geoDist (take inputs as lonlat points)
const distFunc = Clustering.distFunc.euclidean;

const before = performance.now();
const cluster = new Clustering(dataset, distFunc);
const treeNode = cluster.getTree();
console.log("HDBScan run in ", performance.now() - before);

return treeNode;

// const filtered = treeNode.filter((val) => val.data.length === 2, {
// minX: 0,
// maxX: width,
// minY: 0,
// maxY: innerHeight
// });
// filtered.forEach((x) => console.log(x.toString()));

// const filterFunc = val => ...;
// const bbox = {minX:.., maxX:.., minY:.., maxY:..};
// const filteredNodes = treeNode.filter(filterFunc, bbox);
}
Insert cell
// Preview of the { data, id } points built from `reducerResult`, in the same
// shape that clusteredTree passes to the clustering constructor.
Object.entries(reducerResult).map(([k, v]) => ({
data: v,
id: k
}));
Insert cell
/**
 * Recursively copies a binary tree node, adding a `children` array built from
 * its truthy `left` and `right` branches (in that order).
 *
 * @param {object} [t] - Node with optional `left` / `right` sub-nodes.
 * @returns {object} Shallow copy of `t` plus `children`; a missing or empty
 *   node yields `{ children: [] }`.
 */
function convertTree(t) {
  const branches = [t?.left, t?.right].filter(Boolean);
  return {
    ...t,
    children: branches.map((branch) => convertTree(branch))
  };
}
Insert cell
// Preview of the clustering tree after conversion to the `children` form.
convertTree(clusteredTree)
Insert cell
// Draws the converted clustering tree with the imported Tree component.
Tree(convertTree(clusteredTree), { height: 500, width, fit: true, r: 0.5 })
Insert cell
import {Tree} from "@john-guerra/tree-fit"
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more