Published
Edited
Dec 11, 2020
Insert cell
// Notebook title cell: a level-1 Markdown heading passed through the `md` tagged template.
md`# Tokenize text and generate n-grams`
Insert cell
// Input form cell. Exposes three named fields: `min_n` (initial value 2),
// `max_n` (initial value 3) and `text` (a sample sentence).
// NOTE(review): both n inputs are type="text", so their values are presumably
// delivered as strings rather than numbers — confirm against the `form` helper
// imported from "@mbostock/form-input".
viewof data = form(html`<form>
<div>
<label><input name="min_n" type="text" value=2> <i>min_n</i></label>
<label><input name="max_n" type="text" value=3> <i>max_n</i></label>
</div>
<div>
<textarea name="text" cols=100>This is a great work @jack wow.</textarea>
</div>
</form>`)
Insert cell
// Inspector cell: displays the current value of the `data` form defined above.
data
Insert cell
// Displays the distinct entries of `ngrams`. `tokenize` already returns
// Array.from(new Set(...)), so this de-duplication is a no-op on its output.
Array.from(new Set(ngrams));
Insert cell
Insert cell
Insert cell
// Named cell holding the n-grams of a fixed sample sentence. Note the input is
// this string literal, not the `text` field of the `data` form.
ngrams = tokenize("This is a great work @jack wow");
Insert cell
Insert cell
// Scratch cell: the same size-range expression used inside get_ngrams,
// evaluated against the notebook-level `max_n` / `min_n` cells
// (defined outside the visible source — TODO confirm where).
Array.from({length:max_n-min_n+1},(v,k)=>k+min_n)
Insert cell

/**
 * Build lower-cased, space-joined n-grams from every chunk.
 *
 * For each start position in a chunk, one n-gram is produced for every size
 * between `min_n` and `max_n` (inclusive) that fits entirely inside the chunk.
 * N-grams never span two chunks. Duplicates are kept; the result is ordered by
 * chunk, then start position, then size.
 *
 * @param {Array<Array<*>>} chunks - Arrays of items; each item is converted to
 *   text with `toString()` before joining.
 * @param {number|string} [max_n=3] - Largest n-gram size.
 * @param {number|string} [min_n=2] - Smallest n-gram size.
 * @returns {string[]} The generated n-grams (empty when `min_n > max_n`).
 */
function get_ngrams(chunks, max_n = 3, min_n = 2) {
  // Sizes may arrive as numeric strings (e.g. from text inputs); coerce them so
  // that `offset + smallest` below adds numbers instead of concatenating.
  const smallest = Number(min_n);
  const largest = Number(max_n);
  const sizes = Array.from(
    { length: largest - smallest + 1 },
    (_, offset) => offset + smallest
  );

  return chunks.flatMap((chunk) =>
    chunk.flatMap((_, start) =>
      sizes
        // `<=` (not `<`): a window ending exactly on the last item is valid,
        // because slice's end index is exclusive.
        .filter((size) => start + size <= chunk.length)
        .map((size) =>
          chunk
            .slice(start, start + size)
            .map((item) => item.toString())
            .join(" ")
            .toLowerCase()
        )
    )
  );
}
Insert cell
/**
 * Split `text` into tokens and return the distinct n-grams built from them.
 *
 * The pattern classifies each match into one named group: `url`, `mention`,
 * `word`, `space`, or `noword` (a run of non-word, non-whitespace characters).
 * Whitespace matches are discarded; the remaining tokens are handed to
 * `get_chunks`, and the resulting chunks to `get_ngrams` together with the
 * notebook-level `max_n` / `min_n` values.
 *
 * @param {string} text - Input text to tokenize.
 * @returns {string[]} N-grams with duplicates removed, in first-seen order.
 */
function tokenize(text) {
  const pattern =
    /(?<url>[\w]+:\/\/[\S]+)|(?<mention>@\w+)|(?<word>[\w]+)|(?<space>[\s\n]+)|(?<noword>((?!\s)[\W])+)/g;

  const significant = [];
  for (const match of text.matchAll(pattern)) {
    // Copy the match into a plain object, dropping the bulky `input` field and
    // recording where the match ends.
    const { input, ...fields } = match;
    const token = { ...fields, end_index: fields.index + fields[0].length };
    if (!token.groups.space) {
      significant.push(token);
    }
  }

  const grams = get_ngrams(get_chunks(significant), max_n, min_n);
  return [...new Set(grams)];
}
Insert cell
import {form} from "@mbostock/form-input"
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more