Public
Edited
Jul 11, 2023
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Corpus used by the example cells below; `testData` comes from a cell that is
// not part of this excerpt.
sentences = testData
Insert cell
Insert cell
// Example: model `sentences` with 5 topics, stemming switched off, and get the
// plain data object back (topics, vocab, sentences, documents, topicText, theta).
topics = getTopics(sentences, {
returnType: "object",
stemmer: false,
topics: 5
})
Insert cell
Insert cell
Insert cell
// Example: 7 topics built only from long words, using an empty stop-word list
// instead of the default `autoStopwords`.
topics2 = getTopics(sentences, {
topics: 7, // generate 7 topics
minLength: 10, // keep only words that are at least 10 characters long
stopwords: [], // empty stop-word list (replaces the default autoStopwords)
returnType: "object" // return the data object rather than rendered HTML
})
Insert cell
Insert cell
{
const vocabulary = topics2.vocab;
return html`<li>${vocabulary.join("<li>")}`;
}
Insert cell
Insert cell
// Example: same settings as `topics2`, but ask getTopics for its rendered HTML
// view (per-document topic bars followed by the cleaned sentence).
topics3 = getTopics(sentences, {
topics: 7, // generate 7 topics
minLength: 10, // keep only words that are at least 10 characters long
stopwords: [], // empty stop-word list (replaces the default autoStopwords)
returnType: "html" // return rendered HTML instead of the data object
})
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Runs LDA topic modelling over a list of sentences.
 *
 * @param {string[]|string} sentences - One document per array entry, or a single
 *   string that is split on newlines. `null`/`undefined` is treated as empty.
 * @param {Object} [options]
 * @param {number} [options.topics=4] - Number of topics to extract.
 * @param {string[]|Set<string>|Object} [options.stopwords=autoStopwords] - Words
 *   to drop. Accepts an array of words, a Set, or an object (or array with named
 *   properties) whose own truthy keys are the stop words.
 * @param {string} [options.returnType="object"] - "object" returns the data
 *   object; any other value returns the rendered HTML view.
 * @param {number} [options.minLength=2] - Minimum word length (before stemming).
 * @param {Function|false} [options.stemmer=autoStemmer] - Word -> stem function;
 *   a falsy value disables stemming.
 * @param {number} [options.iterations=10000] - Sampler iterations.
 * @param {number} [options.burnIn=2000] - Sampler burn-in iterations.
 * @param {number} [options.thinInterval=100] - Sampler thinning interval.
 * @param {number} [options.sampleLag=10] - Sampler sample lag.
 * @param {number} [options.alpha=0.1] - Document/topic prior passed to the sampler.
 * @param {number} [options.beta=0.01] - Topic/word prior passed to the sampler.
 * @param {number} [options.topTerms=20] - Maximum terms listed per topic.
 * @returns {Object|*} For "object": `{ topics, vocab, sentences, documents,
 *   topicText, theta, documentSentences }`. `sentences` holds every cleaned,
 *   non-empty sentence; `documentSentences[i]` is the sentence behind
 *   `documents[i]` / `theta[i]` (sentences with no remaining words produce no
 *   document). Otherwise the value produced by the `html` template.
 */
getTopics = (
  sentences,
  {
    topics = 4,
    stopwords = autoStopwords,
    returnType = "object",
    minLength = 2,
    stemmer = autoStemmer,
    iterations = 10000,
    burnIn = 2000,
    thinInterval = 100,
    sampleLag = 10,
    alpha = 0.1, // per-document distributions over topics
    beta = 0.01, // per-topic distributions over words
    topTerms = 20
  } = {}
) => {
  // Normalises the stop-word option into a Set so lookups never hit inherited
  // members such as `constructor` or `map`.
  const toStopwordSet = (list) => {
    if (list instanceof Set) return list;
    const set = new Set();
    if (list === null || typeof list !== "object") return set;
    const isList = Array.isArray(list);
    if (isList) {
      list.forEach((entry) => {
        if (typeof entry === "string") set.add(entry);
      });
    }
    Object.keys(list).forEach((key) => {
      // Numeric keys of an array are positions, not words.
      const isPosition = isList && /^\d+$/.test(key);
      if (!isPosition && list[key]) set.add(key);
    });
    return set;
  };

  const rawSentences = Array.isArray(sentences)
    ? sentences
    : String(sentences ?? "").split("\n");

  // Strip links and non-letter characters, then normalise case.
  const cleaned = rawSentences
    .map((sentence) =>
      String(sentence ?? "")
        .replace(/(?:https?|ftp):\/\/[\n\S]+/g, "")
        .replace(/[^a-z\'A-Z ]+/g, " ")
        .trim()
        .toLowerCase()
    )
    .filter((sentence) => sentence);

  const stopwordSet = toStopwordSet(stopwords);
  const vocab = [];
  const vocabIndex = new Map(); // word -> position in `vocab`
  const documents = [];
  const documentSentences = [];

  cleaned.forEach((sentence) => {
    let words = sentence
      .split(/[\s,\"]+/)
      .filter(
        (word) => word && !stopwordSet.has(word) && word.length >= minLength
      );

    if (stemmer) words = words.map((word) => stemmer(word));

    const wordIndices = [];
    words.forEach((word) => {
      if (!word) return; // a stemmer may reduce a word to nothing
      let index = vocabIndex.get(word);
      if (index === undefined) {
        index = vocab.length;
        vocabIndex.set(word, index);
        vocab.push(word);
      }
      wordIndices.push(index);
    });

    if (wordIndices.length > 0) {
      documents.push(wordIndices);
      documentSentences.push(sentence);
    }
  });

  lda.configure(
    documents,
    vocab.length,
    iterations,
    burnIn,
    thinInterval,
    sampleLag
  );
  lda.gibbs(topics, alpha, beta);

  const theta = lda.getTheta();
  const phi = lda.getPhi();

  const data = {
    topics: {},
    vocab,
    sentences: cleaned,
    documents
  };

  const topicText = [];
  const termCount = Math.max(0, Math.min(topTerms, vocab.length));

  phi.forEach((distribution, topicIndex) => {
    // Rank numerically on the 2-significant-digit probability; ties go to the
    // alphabetically later term. Comparing numbers (not strings) keeps values
    // printed in exponent notation, e.g. "2.0e-7", at the bottom.
    const ranked = distribution
      .map((probability, wordIndex) => ({
        term: vocab[wordIndex],
        probability: Number(probability.toPrecision(2))
      }))
      .sort((a, b) => {
        const byProbability = b.probability - a.probability;
        if (byProbability) return byProbability;
        if (a.term < b.term) return 1;
        return a.term > b.term ? -1 : 0;
      });

    const terms = [];
    ranked.slice(0, termCount).forEach(({ term, probability }) => {
      const percent = Math.trunc(probability * 100);
      if (percent < 1) return; // below one percent: not worth listing
      terms.push([term, percent]);
    });

    data.topics[topicIndex] = terms;
    topicText[topicIndex] = terms
      .map(([term]) => term)
      .join(" ")
      .trim();
  });

  data.topicText = topicText;
  data.theta = theta;
  data.documentSentences = documentSentences;

  if (returnType === "object") return data;

  let htmlOutput = '<div class="spacer"> </div>';

  theta.forEach((weights, docIndex) => {
    htmlOutput += '<div class="lines">';
    htmlOutput +=
      '<div style="display:table-cell;width:100px;padding-right:5px">';

    weights.forEach((weight, topicIndex) => {
      const width = Math.trunc(weight * 100);
      htmlOutput += `<div class="box bgcolor${topicIndex}" style="width:${width}px" title="${topicText[topicIndex]}"></div>`;
    });

    // theta rows follow `documents`, so label them with the matching sentence.
    htmlOutput += `</div>${documentSentences[docIndex]}</div>`;
  });

  // Topic 4 uses yellow so it is distinguishable from topic 2 (lime).
  const style = `<style>
.box{
height: 8px;
vertical-align: middle;
margin-top: 8px;
display: table-cell;
}
.color0{ color:#FF0000;} .bgcolor0{ background-color:#FF0000;}
.color1{ color:#0000FF;} .bgcolor1{ background-color:#0000FF;}
.color2{ color:#00FF00;} .bgcolor2{ background-color:#00FF00;}
.color3{ color:#808000;} .bgcolor3{ background-color:#808000;}
.color4{ color:#FFFF00;} .bgcolor4{ background-color:#FFFF00;}
.color5{ color:#008000;} .bgcolor5{ background-color:#008000;}
.color6{ color:#00FFFF;} .bgcolor6{ background-color:#00FFFF;}
.color7{ color:orange;} .bgcolor7{ background-color:orange;}
.color8{ color:#800000;} .bgcolor8{ background-color:#800000;}
.color9{ color:#000080;} .bgcolor9{ background-color:#000080;}
.color10{ color:#FF00FF;} .bgcolor10{ background-color:#FF00FF;}
.color11{ color:#800080;} .bgcolor11{ background-color:#800080;}
.color12{ color:#C0C0C0;} .bgcolor12{ background-color:#C0C0C0;}

.lines {
margin: 0px;
padding: 2px;
display: table;
}
</style>
`;
  return html`${style}${htmlOutput}`;
}
Insert cell
Insert cell
/**
 * Gibbs sampler for LDA topic modelling, exposed as one stateful object.
 *
 * Usage: `configure(...)`, then `gibbs(K, alpha, beta)`, then read the results
 * with `getTheta()` (document x topic) and `getPhi()` (topic x word).
 * All state lives on the object itself (`this.nw`, `this.nd`, ...), and relies
 * on the `makeArray` / `make2DArray` helpers for zero-filled count tables.
 */
lda = new (function () {
  /**
   * Stores the corpus and sampler settings.
   * @param {number[][]} docs - Each document as a list of vocabulary indices.
   * @param {number} v - Vocabulary size.
   * @param {number} iterations - Total sampling sweeps.
   * @param {number} burnIn - Sweeps to discard before collecting samples.
   * @param {number} thinInterval - Interval used for the progress counter.
   * @param {number} sampleLag - Collect a sample every `sampleLag` sweeps after
   *   burn-in; a value <= 0 disables averaging.
   */
  this.configure = function (
    docs,
    v,
    iterations,
    burnIn,
    thinInterval,
    sampleLag
  ) {
    this.ITERATIONS = iterations;
    this.BURN_IN = burnIn;
    this.THIN_INTERVAL = thinInterval;
    this.SAMPLE_LAG = sampleLag;
    this.documents = docs;
    this.V = v;
    this.dispcol = 0;
    this.numstats = 0;
  };

  /** Assigns every word a random topic and builds the count tables. */
  this.initialState = function (K) {
    const M = this.documents.length;
    this.nw = make2DArray(this.V, K); // word x topic counts
    this.nd = make2DArray(M, K); // document x topic counts
    this.nwsum = makeArray(K); // words per topic
    this.ndsum = makeArray(M); // words per document
    this.z = [];
    for (let m = 0; m < M; m++) {
      const N = this.documents[m].length;
      this.z[m] = [];
      for (let n = 0; n < N; n++) {
        // Math.floor, not parseInt: parseInt stringifies first, so a product
        // below 1e-6 ("4e-7") would parse as 4 and fall outside 0..K-1.
        const topic = Math.floor(Math.random() * K);
        this.z[m][n] = topic;
        this.nw[this.documents[m][n]][topic]++;
        this.nd[m][topic]++;
        this.nwsum[topic]++;
      }
      this.ndsum[m] = N;
    }
  };

  /**
   * Runs the sampler for the configured number of iterations.
   * @param {number} K - Number of topics.
   * @param {number} alpha - Document/topic smoothing.
   * @param {number} beta - Topic/word smoothing.
   */
  this.gibbs = function (K, alpha, beta) {
    this.K = K;
    this.alpha = alpha;
    this.beta = beta;
    if (this.SAMPLE_LAG > 0) {
      this.thetasum = make2DArray(this.documents.length, this.K);
      this.phisum = make2DArray(this.K, this.V);
      this.numstats = 0;
    }
    this.initialState(K);

    console.log(
      `Sampling ${this.ITERATIONS} iterations with burn-in of ${this.BURN_IN} (B/S=${this.THIN_INTERVAL}).`
    );

    for (let i = 0; i < this.ITERATIONS; i++) {
      for (let m = 0; m < this.z.length; m++) {
        for (let n = 0; n < this.z[m].length; n++) {
          this.z[m][n] = this.sampleFullConditional(m, n);
        }
      }

      // `dispcol` is a progress-column counter; nothing in this object reads it.
      const onThinInterval = i % this.THIN_INTERVAL === 0;
      if (i < this.BURN_IN && onThinInterval) {
        this.dispcol++;
      }
      if (i > this.BURN_IN && onThinInterval) {
        this.dispcol++;
      }
      if (
        i > this.BURN_IN &&
        this.SAMPLE_LAG > 0 &&
        i % this.SAMPLE_LAG === 0
      ) {
        this.updateParams();
        if (!onThinInterval) this.dispcol++;
      }
      if (this.dispcol >= 100) {
        this.dispcol = 0;
      }
    }
  };

  /**
   * Resamples the topic of word `n` in document `m` and updates the counts.
   * @returns {number} The newly drawn topic index.
   */
  this.sampleFullConditional = function (m, n) {
    const word = this.documents[m][n];
    let topic = this.z[m][n];

    // Remove the word's current assignment from the counts.
    this.nw[word][topic]--;
    this.nd[m][topic]--;
    this.nwsum[topic]--;
    this.ndsum[m]--;

    // Cumulative (unnormalised) topic weights.
    const p = makeArray(this.K);
    for (let k = 0; k < this.K; k++) {
      p[k] =
        (((this.nw[word][k] + this.beta) /
          (this.nwsum[k] + this.V * this.beta)) *
          (this.nd[m][k] + this.alpha)) /
        (this.ndsum[m] + this.K * this.alpha);
    }
    for (let k = 1; k < p.length; k++) {
      p[k] += p[k - 1];
    }

    const u = Math.random() * p[this.K - 1];
    for (topic = 0; topic < p.length; topic++) {
      if (u < p[topic]) break;
    }
    // Rounding can leave u equal to the last cumulative weight; stay in range.
    if (topic >= p.length && p.length > 0) topic = p.length - 1;

    this.nw[word][topic]++;
    this.nd[m][topic]++;
    this.nwsum[topic]++;
    this.ndsum[m]++;
    return topic;
  };

  /** Adds the current theta/phi estimates to the running sums. */
  this.updateParams = function () {
    for (let m = 0; m < this.documents.length; m++) {
      for (let k = 0; k < this.K; k++) {
        this.thetasum[m][k] +=
          (this.nd[m][k] + this.alpha) / (this.ndsum[m] + this.K * this.alpha);
      }
    }
    for (let k = 0; k < this.K; k++) {
      for (let w = 0; w < this.V; w++) {
        this.phisum[k][w] +=
          (this.nw[w][k] + this.beta) / (this.nwsum[k] + this.V * this.beta);
      }
    }
    this.numstats++;
  };

  /**
   * @returns {number[][]} Document x topic proportions: the average of the
   *   collected samples, or the estimate from the current counts when no
   *   sample was collected (avoids dividing by zero).
   */
  this.getTheta = function () {
    const averaged = this.SAMPLE_LAG > 0 && this.numstats > 0;
    const theta = [];
    for (let m = 0; m < this.documents.length; m++) {
      theta[m] = [];
      for (let k = 0; k < this.K; k++) {
        theta[m][k] = averaged
          ? this.thetasum[m][k] / this.numstats
          : (this.nd[m][k] + this.alpha) /
            (this.ndsum[m] + this.K * this.alpha);
      }
    }
    return theta;
  };

  /**
   * @returns {number[][]} Topic x word probabilities: the average of the
   *   collected samples, or the estimate from the current counts when no
   *   sample was collected (avoids dividing by zero).
   */
  this.getPhi = function () {
    const averaged = this.SAMPLE_LAG > 0 && this.numstats > 0;
    const phi = [];
    for (let k = 0; k < this.K; k++) {
      phi[k] = [];
      for (let w = 0; w < this.V; w++) {
        phi[k][w] = averaged
          ? this.phisum[k][w] / this.numstats
          : (this.nw[w][k] + this.beta) / (this.nwsum[k] + this.V * this.beta);
      }
    }
    return phi;
  };
})()
Insert cell
Insert cell
// Builds a fresh array holding `x` zeros (empty when `x` is not positive).
makeArray = (x) => {
  const zeros = [];
  while (zeros.length < x) {
    zeros.push(0);
  }
  return zeros;
}
Insert cell
// Builds an `x` by `y` table of zeros; every row is its own array.
make2DArray = (x, y) => {
  const grid = [];
  while (grid.length < x) {
    const row = [];
    while (row.length < y) {
      row.push(0);
    }
    grid.push(row);
  }
  return grid;
}
Insert cell
Insert cell
Insert cell
Insert cell
autoStemmer = {
const porterStemmer = await import(
"https://cdn.skypack.dev/@stdlib/nlp-porter-stemmer@0.0.7?min"
);
return porterStemmer.default;
}
Insert cell
Insert cell
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more