Published
Edited
Dec 11, 2020
Insert cell
md`# Tokenize text and generate n-grams`
Insert cell
// Input form cell: two text fields named min_n (initial value 2) and max_n
// (initial value 3), plus a textarea named "text" pre-filled with a sample sentence.
// `form` is imported from "@mbostock/form-input" in a later cell.
// NOTE(review): both number fields are type="text", so their values are
// presumably strings — confirm before using them in arithmetic.
viewof data = form(html`<form>
<div>
<label><input name="min_n" type="text" value=2> <i>min_n</i></label>
<label><input name="max_n" type="text" value=3> <i>max_n</i></label>
</div>
<div>
<textarea name="text" cols=100>This is a great work @jack wow.</textarea>
</div>
</form>`)
Insert cell
data
Insert cell
// Shows `ngrams` with duplicates removed (round-trip through a Set).
// tokenize() already returns a de-duplicated array, so this is a display-only check.
Array.from(new Set(ngrams));
Insert cell
Insert cell
Insert cell
// Runs tokenize() on a hard-coded sample sentence; this cell does not read
// the text entered in the form.
ngrams = tokenize("This is a great work @jack wow");
Insert cell
Insert cell
// Scratch cell: lists the integers min_n, min_n+1, …, max_n. The same
// expression is used inside get_ngrams.
// NOTE(review): min_n and max_n are not defined in any visible cell — confirm where they come from.
Array.from({length:max_n-min_n+1},(v,k)=>k+min_n)
Insert cell

/**
 * Build word n-grams for every chunk.
 *
 * For each start position in a chunk, and for each size from `min_n` to
 * `max_n` inclusive, the items in that window are converted with
 * `toString()`, joined with a single space and lower-cased.
 *
 * @param {Array<Array<*>>} chunks - List of chunks; each chunk is an array of
 *   items that expose `toString()`.
 * @param {number|string} [max_n=3] - Largest n-gram size (inclusive).
 * @param {number|string} [min_n=2] - Smallest n-gram size (inclusive).
 * @returns {string[]} N-grams ordered by chunk, then start position, then
 *   size. Duplicates are kept; n-grams never span two chunks.
 */
function get_ngrams(chunks, max_n = 3, min_n = 2) {
  // Coerce the bounds so numeric strings behave like numbers; without this,
  // `k + min_n` would concatenate instead of add.
  const smallest = Number(min_n);
  const largest = Number(max_n);
  const sizes = Array.from(
    { length: largest - smallest + 1 },
    (_, k) => k + smallest
  );

  return chunks.flatMap((chunk) =>
    chunk.flatMap((_, start) =>
      sizes
        // The window [start, start + size) fits when its end does not pass
        // the end of the chunk. `<=` keeps the n-gram that ends on the last
        // item; a strict `<` would drop it.
        .filter((size) => start + size <= chunk.length)
        .map((size) =>
          chunk
            .slice(start, start + size)
            .map((item) => item.toString())
            .join(" ")
            .toLowerCase()
        )
    )
  );
}
Insert cell
/**
 * Tokenize `text` and return its distinct n-grams.
 *
 * The text is scanned with one regular expression whose alternatives are the
 * named groups `url`, `mention`, `word`, `space` and `noword`. Each match is
 * copied without its `input` field and given an `end_index` (start index plus
 * matched length). Whitespace matches are dropped, the remaining tokens are
 * grouped by `get_chunks` (defined elsewhere in the notebook) and passed to
 * `get_ngrams`.
 *
 * `max_n` and `min_n` are read from the enclosing scope, not from parameters.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} N-grams with duplicates removed, in first-seen order.
 */
function tokenize(text) {
  const pattern =
    /(?<url>[\w]+:\/\/[\S]+)|(?<mention>@\w+)|(?<word>[\w]+)|(?<space>[\s\n]+)|(?<noword>((?!\s)[\W])+)/g;

  const matches = Array.from(text.matchAll(pattern));

  // Omit `input` (the full source string) from every match and record where
  // the match ends.
  const annotated = matches.map(({ input, ...rest }) => ({
    ...rest,
    end_index: rest.index + rest[0].length
  }));

  const significant = annotated.filter((token) => !token.groups.space);

  const chunks = get_chunks(significant);
  const grams = get_ngrams(chunks, max_n, min_n);
  return [...new Set(grams)];
}
Insert cell
import {form} from "@mbostock/form-input"
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more