Published
Edited
Dec 3, 2021
Insert cell
Insert cell
// Preview table: every raw value next to the fingerprint key computed for it.
Inputs.table(
  data.map((value) => {
    const key = fingerprint(value);
    return { original: value, fingerprint: key };
  })
)
Insert cell
// Demo cell: decompose "Caféææ" with NFD, strip the combining accent, then let
// removeNonDiacriticSpecialChars expand each "æ" to "ae" (result: "Cafeaeae").
removeNonDiacriticSpecialChars(
"Caféææ".normalize("NFD").replace(/\p{Diacritic}/gu, "")
)
Insert cell
// Preview table: every raw value next to its n-gram key (default n = 2).
Inputs.table(
  data.map((value) => {
    const key = ngram(value);
    return { original: value, ngram: key };
  })
)
Insert cell
function ngram(dirty, n = 2) {
  /*
  Builds an n-gram clustering key for a string.

  From https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth
  and https://github.com/OpenRefine/OpenRefine/blob/master/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java

  dirty: the raw string to key.
  n: size of each n-gram, 2 by default.
  Returns the sorted, de-duplicated n-grams joined into one string; the result
  is "" when the cleaned text is shorter than n.
  */
  const lowered = dirty.toLowerCase();
  const compact = lowered.replace(/\p{P}|\p{Control}|\s+/gu, ""); //Drop punctuation, control characters and whitespace
  const decomposed = compact.normalize("NFD"); //Decomposed form separates base letters from their accents
  const unaccented = decomposed.replace(/\p{Diacritic}/gu, ""); //Discard the accents
  const clean = removeNonDiacriticSpecialChars(unaccented); //Transliterate the remaining special characters

  //One window per start offset that still leaves room for n characters
  const windowCount = Math.max(0, clean.length - n + 1);
  const grams = Array.from({ length: windowCount }, (_, start) =>
    clean.slice(start, start + n)
  );

  grams.sort((left, right) => left.localeCompare(right)); //Alphabetical order

  //A Set keeps only the first occurrence of each n-gram
  return [...new Set(grams)].join("");
}
Insert cell
function fingerprint(dirty) {
  /*
  Builds a "fingerprint" clustering key for a string: trim, lowercase, remove
  punctuation and control characters, strip diacritics, transliterate special
  characters, then split into whitespace-separated tokens that are sorted,
  de-duplicated and re-joined with single spaces.

  dirty: the raw string to key.
  Returns the key; "" when no tokens remain.
  */
  const normalized = dirty
    .trim()
    .toLowerCase()
    //NOTE(review): \p{Control} also matches tab and newline, so those are deleted here rather than acting as token separators
    .replace(/\p{P}|\p{Control}/gu, "") //Remove punctuation and control characters
    .normalize("NFD") //Normalize unicode into decomposed form
    .replace(/\p{Diacritic}/gu, ""); //Strip diacritics

  const tokens = removeNonDiacriticSpecialChars(normalized) //Remove non diacritic special characters
    .split(/\s+/) //Tokenize based on whitespace
    //Removing punctuation after trim() can leave leading/trailing whitespace (e.g. "hello !"),
    //which would otherwise yield an empty token and a key with a stray leading space
    .filter((token) => token !== "")
    .sort((a, b) => a.localeCompare(b)); //Sort alphabetically

  return Array.from(new Set(tokens)) //Deduplicate
    .join(" "); //Put back together to make string
}
Insert cell
function removeNonDiacriticSpecialChars(dirty) {
  /*
  Replaces each special character in the table below with its ASCII stand-in
  (for example "ß" becomes "ss" and "æ" becomes "ae"). Only the exact
  characters listed are matched; no case folding happens here.

  List is from https://github.com/OpenRefine/OpenRefine/blob/master/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java
  which also cites: https://stackoverflow.com/a/1453284/167425 by Andreas Petersson

  dirty: the string to transliterate.
  Returns a new string with every listed character substituted.
  */
  const asciiBySpecialChar = new Map([
    ["ß", "ss"],
    ["æ", "ae"],
    ["ø", "oe"],
    ["å", "aa"],
    ["©", "c"],
    ["ð", "d"],
    ["đ", "d"],
    ["ɖ", "d"],
    ["þ", "th"],
    ["ƿ", "w"],
    ["ħ", "h"],
    ["ı", "i"],
    ["ĸ", "k"],
    ["ł", "l"],
    ["ŋ", "n"],
    ["ſ", "s"],
    ["ŧ", "t"],
    ["œ", "oe"],
    ["ẜ", "s"],
    ["ẝ", "s"]
  ]);

  //One character class covering every key; none of the keys is special inside [...]
  const specialCharPattern = new RegExp(
    `[${[...asciiBySpecialChar.keys()].join("")}]`,
    "g"
  );

  //Every replacement is plain ASCII, so a single pass gives the same result as
  //applying the substitutions one after another
  return dirty.replace(specialCharPattern, (match) =>
    asciiBySpecialChar.get(match)
  );
}
Insert cell
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more