Public
Edited
Mar 22, 2024
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Stacked horizontal bar chart: number of utterances per locale, stacked by
// age group. Locales (y) are ordered by their total utterance count.
// The palette is passed through `colors`, the same option name and scheme the
// percentage version of this chart uses; the previous `color` key held a
// string that is not a valid colour and did not match that option name.
chartSplitsByAgeRange = StackedBarChart(splitsByAge, {
  x: (d) => d.utterances,
  y: (d) => d.locale,
  z: (d) => d.ageKey,
  xLabel: "# of utterances by age group",
  yLabel: "language-locale",
  yDomain: d3.groupSort(
    splitsByAge,
    (D) => d3.sum(D, (d) => d.utterances),
    (d) => d.locale
  ), // sort y by x
  zDomain: ages,
  colors: d3.schemeTableau10
})
Insert cell
Insert cell
Insert cell
Insert cell
// Stacked horizontal bar chart: share of utterances per locale, stacked by
// age group. Locales are still ordered by their total utterance count.
chartSplitsByAge100Percent = StackedBarChart(splitsByAge, {
  xLabel: "% of utterances by age group",
  yLabel: "language-locale",
  x: ({ agePercentage }) => agePercentage,
  y: ({ locale }) => locale,
  z: ({ ageKey }) => ageKey,
  // Order the locale axis by the summed utterances of each locale's rows.
  yDomain: d3.groupSort(
    splitsByAge,
    (rows) => d3.sum(rows, (row) => row.utterances),
    (row) => row.locale
  ),
  zDomain: ages,
  colors: d3.schemeTableau10
})
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Stacked horizontal bar chart: share of utterances per locale, stacked by
// gender. Locales are ordered by their total utterance count.
chartSplitsByGender100Percent = StackedBarChart(splitsByGender, {
  xLabel: "% of utterances by gender",
  yLabel: "language-locale",
  x: ({ genderPercentage }) => genderPercentage,
  y: ({ locale }) => locale,
  z: ({ genderKey }) => genderKey,
  // Order the locale axis by the summed utterances of each locale's rows.
  yDomain: d3.groupSort(
    splitsByGender,
    (rows) => d3.sum(rows, (row) => row.utterances),
    (row) => row.locale
  ),
  zDomain: genders,
  colors: d3.schemeTableau10
})
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Dot plot of the average utterance duration (x) for each locale (y).
// Each element of localeData is read as a [locale, stats] pair.
Plot.plot({
  grid: true,
  inset: 10,
  width: 1200,
  height: 1000,
  color: {
    legend: true
  },
  // The duration runs along the horizontal axis, so the label arrow points right.
  x: { label: "→ Average utterance duration (secs)" },
  y: { label: "locale" },
  marks: [
    Plot.frame(),
    Plot.dot(localeData, {
      x: (d) => d[1].avgDurationSecs,
      y: (d) => d[0],
      // Order the locale axis by the plotted x value, largest first.
      sort: { y: "x", reverse: true }
    })
  ]
})
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Horizontal bar chart of the validated share of recorded hours per locale.
// Each element of localeData is read as a [locale, stats] pair; the bar length
// is stats.validHrs / stats.totalHrs.
totalHoursValidated100Percent = Plot.plot({
  grid: true,
  width: 1200,
  height: 2500,
  // The channel value is a 0–1 ratio of hours; `percent: true` rescales it to
  // 0–100 so the axis ticks agree with the "%" in the label.
  x: { label: "→ % of hours validated", percent: true },
  y: { label: "locale" },
  marks: [
    Plot.barX(localeData, {
      x: (d) => d[1].validHrs / d[1].totalHrs,
      y: (d) => d[0],
      // Order the locale axis by the plotted x value, largest first.
      sort: { y: "x", reverse: true },
      fill: "#000000aa"
    }),
    // Baseline at x = 0 for horizontal bars.
    Plot.ruleX([0])
  ]
})
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Stacked horizontal bar chart: number of utterances per locale, stacked by
// validation status. Locales are ordered by their total clip count.
chartClipsByValidationStatus = StackedBarChart(splitsByStatus, {
  xLabel: "# of utterances by validation status",
  yLabel: "language-locale",
  x: ({ validationClips }) => validationClips,
  y: ({ locale }) => locale,
  z: ({ validationKey }) => validationKey,
  // Order the locale axis by the summed `clips` of each locale's rows.
  yDomain: d3.groupSort(
    splitsByStatus,
    (rows) => d3.sum(rows, (row) => row.clips),
    (row) => row.locale
  ),
  zDomain: splitStatusesValidation,
  colors: d3.schemeSet1
})
Insert cell
Insert cell
Insert cell
Insert cell
// Stacked horizontal bar chart: share of utterances per locale, stacked by
// validation status. Locales are ordered by their total clip count.
chartClipsByValidationStatus100Percent = StackedBarChart(splitsByStatus, {
  xLabel: "% of utterances by validation status",
  yLabel: "language-locale",
  x: ({ validationPercentage }) => validationPercentage,
  y: ({ locale }) => locale,
  z: ({ validationKey }) => validationKey,
  // Order the locale axis by the summed `clips` of each locale's rows.
  yDomain: d3.groupSort(
    splitsByStatus,
    (rows) => d3.sum(rows, (row) => row.clips),
    (row) => row.locale
  ),
  zDomain: splitStatusesValidation,
  colors: d3.schemeSet1
})
Insert cell
Insert cell
Insert cell
Insert cell
// Stacked horizontal bar chart: number of utterances per locale, stacked by
// dev/test/train split. Locales are ordered by their total clip count.
// NOTE(review): the series key and value are read from `validationKey` and
// `validationClips`, while zDomain lists the dev/test/train statuses — confirm
// that those fields actually carry the split values in splitsByStatus.
chartClipsBySplitStatus = StackedBarChart(splitsByStatus, {
  xLabel: "# of utterances by split status",
  yLabel: "language-locale",
  x: ({ validationClips }) => validationClips,
  y: ({ locale }) => locale,
  z: ({ validationKey }) => validationKey,
  // Order the locale axis by the summed `clips` of each locale's rows.
  yDomain: d3.groupSort(
    splitsByStatus,
    (rows) => d3.sum(rows, (row) => row.clips),
    (row) => row.locale
  ),
  zDomain: splitStatusesDevTrainTest,
  colors: d3.schemeDark2
})
Insert cell
Insert cell
Insert cell
Note that the percentage here is calculated as a % of the total number of clips for the language. Not all the clips in a language are included in the dev-test-train splits; for example, clips that are `invalidated` are left out.

### An unknown
One of the things I need to follow up on with Gabe is whether _duplicated_ sentences are removed from the splits; I am not sure whether they are.
I know that the Common Voice (CV) project is planning to revise how splits are calculated, so this graph will need to be updated in the future.
Insert cell
// Stacked horizontal bar chart: share of utterances per locale, stacked by
// dev/test/train split. Locales are ordered by their total clip count.
// NOTE(review): the series key and value are read from `validationKey` and
// `validationPercentage`, while zDomain lists the dev/test/train statuses —
// confirm that those fields actually carry the split values in splitsByStatus.
chartClipsBySplitStatus100Percent = StackedBarChart(splitsByStatus, {
  xLabel: "% of utterances by dev test train split status",
  yLabel: "language-locale",
  x: ({ validationPercentage }) => validationPercentage,
  y: ({ locale }) => locale,
  z: ({ validationKey }) => validationKey,
  // Order the locale axis by the summed `clips` of each locale's rows.
  yDomain: d3.groupSort(
    splitsByStatus,
    (rows) => d3.sum(rows, (row) => row.clips),
    (row) => row.locale
  ),
  zDomain: splitStatusesDevTrainTest,
  colors: d3.schemeDark2
})
Insert cell
Insert cell
Insert cell
// Dot plot of the average number of clips per contributor (x) for each
// locale (y). Each element of localeData is read as a [locale, stats] pair.
avgClipsPerContributor = Plot.plot({
  grid: true,
  inset: 10,
  width: 1200,
  height: 1000,
  color: {
    legend: true
  },
  // The average runs along the horizontal axis, so the label arrow points right.
  x: { label: "→ Average number of clips per contributor" },
  y: { label: "locale" },
  marks: [
    Plot.frame(),
    Plot.dot(localeData, {
      // Math.trunc drops the fractional part numerically; parseInt would first
      // convert the quotient to a string, which misreads exponent notation.
      // A locale with no users yields NaN rather than Infinity, matching what
      // parseInt produced for a division by zero.
      x: (d) => (d[1].users > 0 ? Math.trunc(d[1].clips / d[1].users) : NaN),
      y: (d) => d[0],
      // Order the locale axis by the plotted x value, smallest first.
      sort: { y: "x", reverse: false }
    })
  ]
})
Insert cell
Insert cell
Insert cell
// Parses the notebook's attached JSON file. The file name suggests a Common
// Voice corpus 17.0 statistics export dated 2024-03-15 — confirm the source.
cvdata = FileAttachment("cv-corpus-17.0-2024-03-15.json").json()
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Validation status keys, passed as zDomain (the stacked series) to the
// validation-status charts.
splitStatusesValidation = ["invalidated", "other", "validated"]
Insert cell
// Dataset split keys, passed as zDomain (the stacked series) to the
// dev/test/train split charts.
splitStatusesDevTrainTest = ["dev", "test", "train"]
Insert cell
Insert cell
## Imports
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more