Published
Edited
Nov 15, 2021
Insert cell
# Mihir_Minor_Assignment
Insert cell
<br>
#### Link to visualization - https://fivethirtyeight.com/features/straight-outta-compton-is-the-rare-biopic-not-about-white-dudes/
<br>
#### Link to dataset - https://github.com/fivethirtyeight/data/blob/master/biopics/biopics.csv
<br>
Insert cell
### Adding dataset, d3 and arquero libraries
Insert cell
// Raw-file URL of the FiveThirtyEight biopics CSV; passed to d3.csv by the loader cells.
csvURL = "https://raw.githubusercontent.com/fivethirtyeight/data/master/biopics/biopics.csv"
Insert cell
d3 = require("d3")
Insert cell
aq = require("arquero")
Insert cell
// Loads the CSV with no row-conversion function (compare biopic_main, which passes d3.autoType).
biopic = d3.csv(csvURL)
Insert cell
// Loads the same CSV with d3.autoType as the row converter; this is the table every later cell derives from.
biopic_main = d3.csv(csvURL, d3.autoType)
Insert cell
### Main dataset table
Insert cell
// Displays the loaded dataset as a table.
Inputs.table(biopic_main)
Insert cell
import {Wrangler, op} from "@observablehq/data-wrangler"
Insert cell
// Invokes the Wrangler helper imported from @observablehq/data-wrangler on the main dataset.
Wrangler(biopic_main)
Insert cell
### Dataset for male actors
Insert cell
// Rows of biopic_main whose subject_sex is "Male", reduced to five columns and
// returned as an array of objects. Every row carries malecount = 1 so a chart
// can sum that column to get a per-year count.
bio_male = aq.from(biopic_main)
  .select('title', 'year_release', 'subject', 'subject_sex', 'lead_actor_actress')
  // Exact comparison rather than op.match(): op.match performs a regular-
  // expression match, so it would also accept any value merely containing "Male".
  .filter(d => d.subject_sex === 'Male')
  .derive({ malecount: d => 1 })
  .objects()
Insert cell
### Dataset for female actors
Insert cell
// Rows of biopic_main whose subject_sex is "Female", reduced to five columns
// and returned as an array of objects. Every row carries femalecount = 1 so a
// chart can sum that column to get a per-year count.
bio_female = aq.from(biopic_main)
  .select('title', 'year_release', 'subject', 'subject_sex', 'lead_actor_actress')
  // Exact comparison rather than op.match(): op.match performs a regular-
  // expression match, so it would also accept any value merely containing "Female".
  .filter(d => d.subject_sex === 'Female')
  .derive({ femalecount: d => 1 })
  .objects()
Insert cell
vegaEmbed = require("vega-embed")
Insert cell
## Basic visualization - Layered Bar chart 1
#### Dataset for male actors and female actors displayed on different layers
#### Tooltip option used
#### Drawback: Bars in the top layer occlude the bars in the layer behind them.
#### Ex: Year 1985 has 4 male and 4 female actors. In the graph, only female bar is visible
#### Original visualization also faces similar issue thus misguiding the viewer :|
Insert cell
// Layered bar chart: one layer per dataset (bio_male, then bio_female), both
// plotted against year_release. The layers are drawn on top of each other, so
// the second (female) layer can hide the first where the bars overlap.
vegaEmbed({
  width: 875,
  height: 400,
  layer: [
    {
      data: { values: bio_male },
      mark: { type: "rect", fill: "#1d697c", tooltip: true },
      encoding: {
        // Full type names, spelled the same way in both layers.
        x: { field: "year_release", type: "nominal" },
        y: { field: "malecount", type: "quantitative", aggregate: "sum" }
      }
    },
    {
      data: { values: bio_female },
      mark: { type: "rect", fill: "#ffb6c1", tooltip: true },
      encoding: {
        x: { field: "year_release", type: "nominal" },
        y: { field: "femalecount", type: "quantitative", aggregate: "sum" }
      }
    }
  ]
})
Insert cell
// Second invocation of the Wrangler helper on the main dataset (same call as the earlier cell).
Wrangler(biopic_main)
Insert cell
## Appending 2 columns to the original table (Version 1)
#### 1. Column for total count of actors
#### 2. Column to differentiate male and female actors based on integer datatype - '1' > Male | '2' > Female
Insert cell
// biopic_main plus two helper columns, returned as an array of objects:
//   count     - constant 1 on every row, so charts can sum it into a total
//   sex_count - numeric code for the subject's sex: 2 for "Female", 1 otherwise
// sex_count is stored as a number (not the strings '1'/'2') so it matches the
// integer coding described for this table.
biopic_count = aq.from(biopic_main)
  .select('title','site','country','year_release','box_office','director','number_of_subjects','subject','type_of_subject','race_known','person_of_color','subject_sex','lead_actor_actress')
  .derive({
    count: d => 1,
    sex_count: d => (d.subject_sex === "Female" ? 2 : 1)
  })
  .objects()
Insert cell
// Displays the table with the added count and sex_count columns.
Inputs.table(biopic_count)
Insert cell
## Using updated table to make visualization
#### Used single updated table instead of two tables
#### Opacity used to reveal the layer behind
#### Drawback > The tooltip reads only the layer on top, not the layer below
#### Ex: Check year 1985
Insert cell
// Overlaid (unstacked) bar chart built from the single biopic_count table.
// A "gender" label is derived from sex_count, bars for both genders start at
// zero (stack: null), and a fixed opacity lets the bar behind show through.
vegaEmbed({
  // Numeric pixel sizes rather than numeric strings.
  width: 890,
  height: 400,
  data: { values: biopic_count },
  transform: [
    { calculate: "datum.sex_count == 2 ? 'Female' : 'Male'", as: "gender" }
  ],
  mark: { type: "bar", tooltip: true },
  encoding: {
    x: { field: "year_release", type: "nominal" },
    y: {
      field: "count",
      type: "quantitative",
      aggregate: "sum",
      stack: null
    },
    color: {
      field: "gender",
      type: "nominal",
      scale: { range: ["#675193", "#ca8861"] }
    },
    opacity: { value: 0.7 }
  }
})
Insert cell
### Modified original table a bit (Version 2)
#### created different columns for male and female actor count, retaining the total count column
#### The values for female actors in male actor count were initially kept null then changed to zero.
#### Vice versa applies for male actors
Insert cell
// biopic_main plus three helper columns, returned as an array of objects:
//   total_count  - constant 1 on every row
//   male_count   - 1 when subject_sex is "Male", else 0
//   female_count - 1 when subject_sex is "Female", else 0
// The flags are stored as numbers (not the strings '1'/'0') so they can be
// summed or compared without relying on string-to-number coercion.
biopic_count1 = aq.from(biopic_main)
  .select('title','site','country','year_release','box_office','director','number_of_subjects','subject','type_of_subject','race_known','person_of_color','subject_sex','lead_actor_actress')
  .derive({
    total_count: d => 1,
    male_count: d => (d.subject_sex === "Male" ? 1 : 0),
    female_count: d => (d.subject_sex === "Female" ? 1 : 0)
  })
  .objects()
Insert cell
// Displays the table with the total_count, male_count and female_count columns.
Inputs.table(biopic_count1)
Insert cell
### Tried stacking centrally but failed :(
Insert cell
// Centre-stacked bar chart of total_count per year, coloured by a "gender"
// label derived from the male_count / female_count flags.
// Note: stack: "center" centres the combined stack of both genders around the
// middle of the axis; it does not place the two genders on opposite sides.
vegaEmbed({
  // Numeric pixel sizes rather than numeric strings.
  width: 820,
  height: 400,
  data: { values: biopic_count1 },
  transform: [
    // Legend labels capitalised consistently ('Man' / 'Woman').
    { calculate: "datum.male_count == 1 ? 'Man' : datum.female_count == 1 ? 'Woman' : null", as: "gender" }
  ],
  mark: { type: "bar", tooltip: true },
  encoding: {
    x: { field: "year_release", type: "nominal" },
    y: {
      field: "total_count",
      type: "quantitative",
      aggregate: "sum",
      stack: "center"
    },
    color: {
      field: "gender",
      type: "nominal",
      scale: { range: ["#675193", "#ca8861"] }
    },
    opacity: { value: 0.7 }
  }
})

Insert cell
### Stacking one above other reveals all the bars but implies that female count is higher than male which is not true
#### Intention was to reveal all the bars, easy for tooltip to scan and display the info
Insert cell
// Stacked bar chart of total_count per year: the segments for each gender sit
// on top of one another, so every segment is visible and hoverable.
vegaEmbed({
  // Numeric pixel sizes rather than numeric strings.
  width: 820,
  height: 400,
  data: { values: biopic_count1 },
  transform: [
    { calculate: "datum.male_count == 1 ? 'Male Actor' : 'Female Actor'", as: "gender" }
  ],
  mark: { type: "bar", tooltip: true },
  encoding: {
    x: { field: "year_release", type: "nominal" },
    y: {
      field: "total_count",
      type: "quantitative",
      aggregate: "sum",
      stack: true
    },
    color: { field: "gender", type: "nominal" },
    opacity: { value: 0.7 }
  }
})
Insert cell
## Final Viz v1.1
#### Population pyramid approach makes sense in this case, no overlaps
#### Although tooltip displays granular information, it is too long to read
#### female population is in negative
Insert cell
// Population-pyramid style chart from biopic_count.
// signed_people is +count for rows coded male and -count for rows coded
// female (sex_count == 2), so the two genders extend in opposite directions
// from zero along x, with one row per year_release on y.
vegaEmbed({
  // Numeric pixel sizes rather than numeric strings.
  width: 2000,
  height: 1000,
  data: { values: biopic_count },
  transform: [
    { calculate: "datum.sex_count == 2 ? 'Female Actor' : 'Male Actor'", as: "gender" },
    { calculate: "datum.sex_count == 2 ? -datum.count : datum.count", as: "signed_people" }
  ],
  mark: { type: "bar", tooltip: true },
  encoding: {
    y: {
      field: "year_release",
      axis: null,
      sort: "ascending"
    },
    x: {
      aggregate: "sum",
      field: "signed_people",
      title: "population",
      axis: { format: "s" },
      stack: true
    },
    color: {
      field: "gender",
      scale: { range: ["#675193", "#ca8861"] },
      legend: { orient: "top", title: null }
    },
    // Only the subject is shown in the tooltip. Fields previously tried here
    // and left disabled: lead_actor_actress, year_release, title, and
    // sum(signed_people).
    tooltip: [
      { field: "subject", type: "nominal" }
    ]
  },
  config: {
    view: { stroke: null },
    axis: { grid: false }
  }
})
Insert cell
### Tried to add count at the end of each bar but tooltip option makes it impossible to display correct values
Insert cell
// Population-pyramid chart with a count label at the end of each bar and a
// detailed tooltip on the bars.
//
// Structure:
//  - The top-level encoding holds only what every layer should share
//    (y, x, color). A top-level encoding applies to all layers, so the tooltip
//    fields are kept inside the bar layer; if they were shared, the text layers
//    would be grouped by subject/title as well and would no longer show one
//    summed label per year and gender.
//  - The mark definition lives in the bar layer; a layered spec takes its marks
//    from its layers.
//  - Each text layer is filtered to one side of the pyramid, so every label is
//    drawn once, just outside its own bar.
vegaEmbed({
  // Numeric pixel sizes rather than numeric strings.
  width: 2000,
  height: 1000,
  data: { values: biopic_count },
  transform: [
    { calculate: "datum.sex_count == 2 ? 'Female Actor' : 'Male Actor'", as: "gender" },
    { calculate: "datum.sex_count == 2 ? -datum.count : datum.count", as: "signed_people" }
  ],
  encoding: {
    y: {
      field: "year_release",
      axis: null,
      sort: "ascending"
    },
    x: {
      aggregate: "sum",
      field: "signed_people",
      title: "population",
      axis: { format: "s" },
      stack: true
    },
    color: {
      field: "gender",
      scale: { range: ["#675193", "#ca8861"] },
      legend: { orient: "top", title: null }
    }
  },
  layer: [
    {
      mark: { type: "bar", tooltip: true },
      encoding: {
        tooltip: [
          { field: "subject", type: "nominal" },
          { field: "lead_actor_actress", type: "nominal" },
          { field: "year_release", type: "nominal" },
          { field: "title", type: "nominal" }
        ]
      }
    },
    {
      // Positive (male) side: label starts just right of the bar end.
      transform: [{ filter: "datum.sex_count != 2" }],
      mark: { type: "text", align: "left", baseline: "middle", dx: 3 },
      encoding: {
        // Sum of the unsigned count, so the label is the number of rows.
        text: { aggregate: "sum", field: "count", type: "quantitative" }
      }
    },
    {
      // Negative (female) side: label ends just left of the bar end.
      transform: [{ filter: "datum.sex_count == 2" }],
      mark: { type: "text", align: "right", baseline: "middle", dx: -3 },
      encoding: {
        text: { aggregate: "sum", field: "count", type: "quantitative" }
      }
    }
  ],
  config: {
    view: { stroke: null },
    axis: { grid: false }
  }
})
Insert cell
### commenting tooltips helps to display correct count > Tradeoff??
Insert cell
// Population-pyramid chart with a count label at the end of each bar and no
// tooltip (this variant deliberately leaves the tooltip fields out).
//
// Structure:
//  - The top-level encoding holds what every layer shares (y, x, color).
//  - The mark definition lives in the bar layer; a layered spec takes its marks
//    from its layers.
//  - Each text layer is filtered to one side of the pyramid, so every label is
//    drawn once, just outside its own bar, instead of both text layers drawing
//    all labels on top of each other.
vegaEmbed({
  // Numeric pixel sizes rather than numeric strings.
  width: 800,
  height: 1000,
  data: { values: biopic_count },
  transform: [
    { calculate: "datum.sex_count == 2 ? 'Female Actor' : 'Male Actor'", as: "gender" },
    { calculate: "datum.sex_count == 2 ? -datum.count : datum.count", as: "signed_people" }
  ],
  encoding: {
    y: {
      field: "year_release",
      axis: null,
      sort: "ascending"
    },
    x: {
      aggregate: "sum",
      field: "signed_people",
      title: "population",
      axis: { format: "s" },
      stack: true
    },
    color: {
      field: "gender",
      scale: { range: ["#675193", "#ca8861"] },
      legend: { orient: "top", title: null }
    }
  },
  layer: [
    {
      mark: "bar"
    },
    {
      // Positive (male) side: label starts just right of the bar end.
      transform: [{ filter: "datum.sex_count != 2" }],
      mark: { type: "text", align: "left", baseline: "middle", dx: 3 },
      encoding: {
        // Sum of the unsigned count, so the label is the number of rows.
        text: { aggregate: "sum", field: "count", type: "quantitative" }
      }
    },
    {
      // Negative (female) side: label ends just left of the bar end.
      transform: [{ filter: "datum.sex_count == 2" }],
      mark: { type: "text", align: "right", baseline: "middle", dx: -3 },
      encoding: {
        text: { aggregate: "sum", field: "count", type: "quantitative" }
      }
    }
  ],
  config: {
    view: { stroke: null },
    axis: { grid: false }
  }
})
Insert cell
// Displays biopic_count1 again as a table.
Inputs.table(biopic_count1)
Insert cell
// Same display call as the cell before it (biopic_count1 as a table).
Inputs.table(biopic_count1)
Insert cell
// Per-year, per-sex totals in long form: one object per
// (year_release, subject_sex) pair, with the summed total_count in totalCount.
// The result is intentionally left un-pivoted: the charts that read `data`
// reference the subject_sex and totalCount fields directly, and a pivot on
// subject_sex would replace both with one column per sex.
data = aq
  .from(biopic_count1)
  .select("year_release", "total_count", "subject_sex")
  .orderby(aq.desc("year_release"))
  .groupby("year_release", "subject_sex")
  .rollup({ totalCount: (d) => aq.op.sum(d.total_count) })
  .objects()
Insert cell
// Displays the aggregated `data` array as a table.
Inputs.table(data)
Insert cell
// Horizontal bars of totalCount per year, split into one column per
// subject_sex and coloured by the same field. Expects each row of `data` to
// carry year_release, subject_sex and totalCount.
vegaEmbed({
  data: { values: data },
  mark: "bar",
  encoding: {
    // year_release holds bare year numbers (e.g. 1985). Typed as "temporal",
    // a number is read as a millisecond timestamp, which would place every
    // year at the start of 1970; "ordinal" keeps one band per year.
    y: { field: "year_release", type: "ordinal" },
    x: { field: "totalCount", type: "quantitative" },
    color: { field: "subject_sex", type: "nominal" },
    column: { field: "subject_sex", type: "nominal" }
  }
})
Insert cell
// Two side-by-side bar charts of totalCount per year: rows with subject_sex
// "Male" on the left (x domain written as [35, 0]) and "Female" on the right
// (x domain [0, 35]). Expects each row of `data` to carry year_release,
// subject_sex and totalCount.
vegaEmbed({
  data: { values: data },
  hconcat: [
    {
      transform: [{ filter: { field: "subject_sex", equal: "Male" } }],
      mark: "bar",
      encoding: {
        // year_release holds bare year numbers (e.g. 1985). Typed as
        // "temporal", a number is read as a millisecond timestamp; "ordinal"
        // keeps one band per year.
        y: { field: "year_release", type: "ordinal" },
        x: { field: "totalCount", type: "quantitative", scale: { domain: [35, 0] } },
        color: { field: "subject_sex", type: "nominal" }
      }
    },
    {
      transform: [{ filter: { field: "subject_sex", equal: "Female" } }],
      mark: "bar",
      encoding: {
        y: { field: "year_release", type: "ordinal" },
        x: { field: "totalCount", type: "quantitative", scale: { domain: [0, 35] } },
        color: { field: "subject_sex", type: "nominal" }
      }
    }
  ]
})
Insert cell
Table
- Year
- Gender_Male
- Value -> SUM(total_count)
Insert cell
### Tried another Population pyramid but X axes are not aligned
### Maybe due to unequal number of males and females
Insert cell
// Population pyramid built from two concatenated bar charts over
// biopic_count1: the "Female" panel on the left (x axis reversed via
// sort: "descending") and the "Male" panel on the right.
vegaEmbed({
  data: { values: biopic_count1 },
  transform: [
    { calculate: "datum.female_count == 1 ? 'Female' : 'Male'", as: "gender" }
  ],
  // Each panel is filtered to one gender, so on its own it would only contain
  // the years present for that gender and the year rows of the two panels
  // would not line up. Sharing the y scale gives both panels the same set of
  // year bands. The x scales are left independent.
  resolve: { scale: { y: "shared" } },
  hconcat: [
    {
      width: 400,
      height: 400,
      transform: [{ filter: { field: "gender", equal: "Female" } }],
      title: "Female",
      mark: { type: "bar", tooltip: true },
      encoding: {
        y: {
          field: "year_release",
          axis: null,
          title: null,
          sort: "descending"
        },
        x: {
          aggregate: "sum",
          field: "total_count",
          title: "population",
          axis: { format: "s" },
          sort: "descending",
          type: "quantitative"
        },
        color: {
          field: "gender",
          scale: { range: ["#675193", "#ca8861"] },
          legend: null
        }
      }
    },
    {
      width: 400,
      height: 400,
      transform: [{ filter: { field: "gender", equal: "Male" } }],
      title: "Male",
      mark: { type: "bar", tooltip: true },
      encoding: {
        y: {
          field: "year_release",
          axis: null,
          title: null,
          sort: "descending"
        },
        x: {
          aggregate: "sum",
          field: "total_count",
          title: "population",
          axis: { format: "s" },
          type: "quantitative"
        },
        color: {
          field: "gender",
          legend: null
        }
      }
    }
  ],
  config: {
    view: { stroke: null },
    axis: { grid: false }
  }
})

Insert cell
## will improve v1.1

Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more