Published
Edited
Apr 16, 2021
2 stars
Insert cell
md`# BERT WordPiece Tokenizer
Tokenizer code based on: https://www.npmjs.com/package/bert-tokenizer
`
Insert cell
viewof text = Textarea({
label: "Text",
placeholder: "Shubhanshu Mishra went to Allahabad. And then came to USA.",
value: "Shubhanshu Mishra went to Allahabad. And then came to USA."
})
Insert cell
// Display the separator value exposed at `bertTokenizer.tokenizer.separator`;
// the text-preview cell in this notebook replaces every occurrence of it with
// a space.
bertTokenizer.tokenizer.separator
Insert cell
// Render the subword tokens as a single display string that starts with
// "TEXT:". Each token is preceded by a space unless `non_hf_format` is set;
// afterwards every occurrence of the tokenizer's separator is turned into a
// space and the surrounding whitespace is trimmed.
["TEXT:", ...subwordTokens.map((token) => (non_hf_format ? "" : " ") + token)]
  .join("")
  .replaceAll(bertTokenizer.tokenizer.separator, ' ')
  .trim()
Insert cell
// Rebuild the token list as "all tokens but the last" followed by the last
// token. For an empty list the last element is `undefined`, so the result is
// `[undefined]`.
[...subwordTokens.slice(0, -1), subwordTokens[subwordTokens.length - 1]]
Insert cell
// Group subword tokens into words. Each group is an array holding one
// word-initial token followed by its "##"-prefixed continuation pieces, e.g.
// ["shu", "##bhan", "##shu", "went"] -> [["shu", "##bhan", "##shu"], ["went"]].
// A "##" token that has no preceding group (i.e. it is the first token) starts
// its own group instead of throwing a TypeError.
subwordTokens.reduce((groups, token) => {
  const continuesPrevious = token.startsWith("##") && groups.length > 0;
  if (continuesPrevious) {
    // Both the accumulator and its inner arrays are created inside this
    // reduce, so appending in place is safe and avoids re-copying the whole
    // list for every token.
    groups[groups.length - 1].push(token);
  } else {
    groups.push([token]);
  }
  return groups;
}, [])
Insert cell
// Convert `tokenIds` back into tokens via the tokenizer. Other cells in this
// notebook treat the result as an array of strings (they call `.slice(0, 2)`
// on each element and compare it with "##").
subwordTokens = bertTokenizer.convertIdsToTokens(tokenIds)
Insert cell
// Tokenize the current value of the `text` textarea. The result is passed to
// `convertIdsToTokens`, so it is presumably a list of vocabulary ids — confirm
// against the BertTokenizer implementation.
tokenIds = bertTokenizer.tokenize(text)
Insert cell
// Display what `convertSingleExample` returns for the current text. The shape
// of that result is determined by BertTokenizer, which is not visible in this
// part of the notebook.
bertTokenizer.convertSingleExample(text)
Insert cell
// Load the vocabulary from the `HF_TXT_VOCAB` URL, forwarding the
// `non_hf_format` flag. `loadVocabTxt` is defined elsewhere in the notebook.
vocab = loadVocabTxt(HF_TXT_VOCAB, non_hf_format)
Insert cell
// Tokenizer instance used by the other cells, built from the loaded vocabulary
// together with the lower-casing and vocabulary-format flags.
bertTokenizer = new BertTokenizer(vocab, { doLowerCase, non_hf_format })
Insert cell
// Flag forwarded to `loadVocabTxt` and to the BertTokenizer options. When it
// is false the text-preview cell joins tokens with a space. Presumably it
// marks a vocabulary that is not in Hugging Face format — confirm against
// `loadVocabTxt`.
non_hf_format = false
Insert cell
// Lower-casing is enabled exactly when the vocabulary URL contains "uncased".
doLowerCase = HF_TXT_VOCAB.includes("uncased")
Insert cell
// URL of the vocabulary text file handed to `loadVocabTxt`. The `doLowerCase`
// cell checks this string for the substring "uncased".
HF_TXT_VOCAB = "https://gist.githubusercontent.com/napsternxg/3bcb38983e32d5b8036eb0b9593a694e/raw/fa15e1f327fbcd23d24f56c286bd7f2a56027b74/bert-base-uncased_vocab.txt"
Insert cell
// vocab = loadVocab(DEFAULT_VOCAB_PATH)
Insert cell
// {
// const i = 15630;
// return [
// n_vocab[i],
// vocab[i],
// n_vocab[i][0] == "[" && n_vocab[i][n_vocab[i].length - 1] == "]"
// ];
// }
Insert cell
// vocab.map((x, i) => [x, i]).filter((x, i) => x[0] == "mis")
Insert cell
// n_vocab[15630].slice(2)
// "".slice(0, 2)
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Scratch check: remove every match of `nsmarksRegex` (defined elsewhere in
// the notebook; presumably it matches non-spacing combining marks) from a
// sample string that contains several combining diacritics.
"Shubhanshu ,̆,̇,̈,̉,̊,̋ Mishra ".replaceAll(nsmarksRegex, "")
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
import { Textarea } from "@observablehq/inputs"
Insert cell
d3 = require("d3")
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more