Public
Edited
Nov 1, 2023
Insert cell
Insert cell
// Observable input cell: a form whose value exposes `data`, `ngrams` and
// `smoothing_model` to the other cells.
viewof form = Inputs.form({
// Free-text source for the language model, pre-filled with a sample comment.
data: Inputs.textarea({
label: "Some Text Data to Make Language Models",
value: `@AlexanderMills Search for an ascii table - you can see that only characters that has value from zero to 127 are valid. (0x7F is 127 in hex). This code matches all characters that are not in the ascii range and removes them. – `
}),
// Slider over 1..5 in steps of 1.
ngrams: Inputs.range([1, 5], { label: "Ngram", step: 1 }),
// NOTE(review): "Kenser Ney" looks like a misspelling of "Kneser-Ney". It is
// left untouched because other cells may compare against this exact string —
// confirm before renaming.
smoothing_model: Inputs.radio(["Laplace", "Kenser Ney"], {
label: "Smoothing Model",
value: "Kenser Ney"
})
})
Insert cell
// Token list derived from the form's text field via `tokenize`.
tokens = tokenize(form.data)
Insert cell
/**
 * Split raw text into lowercase alphabetic tokens framed by boundary markers.
 *
 * Every character that is neither an ASCII letter nor whitespace is deleted
 * (so "don't" becomes "dont"); the remainder is lowercased and split on runs
 * of whitespace.
 *
 * @param {string} text - Raw input text; may span several lines.
 * @returns {string[]} `"[BOS]"`, the tokens in order, then `"[EOS]"`.
 */
function tokenize(text) {
  const words = text
    // Keep all whitespace (not just the space character): the input comes
    // from a multi-line textarea, and stripping newlines or tabs would glue
    // the neighbouring words together.
    .replace(/[^A-Za-z\s]/g, "")
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word !== "");
  return ["[BOS]", ...words, "[EOS]"];
}
Insert cell
// Display cell: n-gram counts for the current tokens, using the order chosen in the form.
get_ngram_counts(tokens, form.ngrams)
Insert cell
/**
 * Count every n-gram of order 1..K that occurs in `tokens`.
 *
 * @param {string[]} tokens - Token sequence, e.g. the output of `tokenize`.
 * @param {number} [K=5] - Largest n-gram order to count.
 * @returns {Array} A pair `[ngram_counts, trie]`:
 *   `ngram_counts[n - 1]` is a Map from each space-joined n-gram of order n
 *   to its number of occurrences; `trie` is a TrieNode holding the same
 *   counts keyed by token path, with the number of start positions at the root.
 */
function get_ngram_counts(tokens, K = 5) {
  const ngram_counts = Array.from({ length: K }, () => new Map());
  const trie = new TrieNode();

  let start = 0;
  for (const ngrams of get_ngrams(tokens, K)) {
    for (const [ngram, order] of ngrams) {
      const counts = ngram_counts[order - 1];
      counts.set(ngram, counts.has(ngram) ? counts.get(ngram) + 1 : 1);
    }
    // TrieNode.increment bumps every node along the path it walks, so
    // inserting only the longest n-gram starting here counts each of its
    // prefixes exactly once. Inserting every prefix separately would count
    // the shorter n-grams several times over.
    trie.increment(tokens.slice(start, start + ngrams.length));
    start += 1;
  }
  return [ngram_counts, trie];
}
Insert cell
// Scratch cell: a Map keyed by an array literal. Map compares object keys by
// identity, so a different array with the same contents would not find this entry.
new Map().set([1, 2], 2)
Insert cell
// Scratch cell: materialise the generator's output for a four-token sample (default K).
[...get_ngrams(["hello", "world", "hello", "world"])]
Insert cell
/**
 * Lazily enumerate the n-grams that begin at each token position.
 *
 * For every start index one array is yielded. Its entries are
 * `[text, order]` pairs of increasing order: the token itself (order 1),
 * then that text extended by one following token at a time, joined with
 * single spaces, until order K or the end of `tokens` is reached.
 *
 * @param {Array} tokens - Token sequence to scan.
 * @param {number} [K=5] - Upper bound on the n-gram order.
 * @yields {Array} The `[text, order]` pairs for one start position.
 */
function* get_ngrams(tokens, K = 5) {
  let start = 0;
  while (start < tokens.length) {
    let text = tokens[start];
    const grams = [[text, 1]];
    let offset = 1;
    while (offset < K && start + offset < tokens.length) {
      text = `${text} ${tokens[start + offset]}`;
      offset += 1;
      grams.push([text, offset]);
    }
    yield grams;
    start += 1;
  }
}
Insert cell
// Scratch cell: read the value stored at path 1 → 2 → 3 in `trie` (null when the path is absent).
trie.get([1, 2, 3])
Insert cell
// Scratch cell: bump the counts along path 1 → 2 → 3 in `trie`. A second
// argument (5) is passed; whether it has any effect depends on the signature
// of TrieNode.increment.
trie.increment([1, 2, 3], 5)
Insert cell
// Empty trie used by the scratch cells that call `trie.get` / `trie.increment`.
trie = new TrieNode()
Insert cell
/**
 * Node of a trie keyed by sequences (any iterable of Map-compatible keys).
 *
 * Each node stores an optional `value` and a Map from the next key element
 * to the child node. The root node represents the empty sequence.
 */
class TrieNode {
  constructor() {
    this.key = null;
    this.children = new Map();
    // null means "nothing stored at this node yet".
    this.value = null;
  }

  /**
   * Store `value` at the node addressed by `key`, creating missing nodes.
   * Nodes created along the way keep a null value.
   *
   * @param {Iterable} key - Path of key elements from the root.
   * @param {*} value - Value to store at the end of the path.
   */
  set(key, value) {
    let node = this;
    for (const k of key) {
      if (!node.children.has(k)) {
        node.children.set(k, new TrieNode());
      }
      node = node.children.get(k);
    }
    node.value = value;
  }

  /**
   * Add `amount` to the counter of every node on the path `key`, including
   * the root and the final node, creating missing nodes. A null counter is
   * treated as 0.
   *
   * @param {Iterable} key - Path of key elements from the root.
   * @param {number} [amount=1] - Quantity added to each node on the path.
   */
  increment(key, amount = 1) {
    let node = this;
    node.value = (node.value === null ? 0 : node.value) + amount;
    for (const k of key) {
      if (!node.children.has(k)) {
        node.children.set(k, new TrieNode());
      }
      node = node.children.get(k);
      node.value = (node.value === null ? 0 : node.value) + amount;
    }
  }

  /**
   * Look up the value stored at `key`.
   *
   * @param {Iterable} key - Path of key elements from the root.
   * @returns {*} The stored value, or null when the path does not exist or
   *   nothing has been stored at its final node.
   */
  get(key) {
    let node = this;
    for (const k of key) {
      if (!node.children.has(k)) {
        return null;
      }
      node = node.children.get(k);
    }
    return node.value;
  }

  /**
   * Check whether the path `key` exists in the trie.
   *
   * This is a path test only: it is true for any prefix of an inserted key,
   * even when no value was stored at that prefix.
   *
   * @param {Iterable} key - Path of key elements from the root.
   * @returns {boolean} True when every element of `key` can be followed.
   */
  has(key) {
    let node = this;
    for (const k of key) {
      if (!node.children.has(k)) {
        return false;
      }
      node = node.children.get(k);
    }
    return true;
  }
}
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more