Published
Edited
Mar 14, 2019
Fork of PSET 4
Insert cell
Insert cell
Insert cell
Insert cell
// import vegalite as we'll need it for graphing later
vegalite = require("@observablehq/vega-lite@0.1")
Insert cell
// import package for making matrices of graphs as we'll need it later
embed = require("vega-embed@3")
Insert cell
// import zebras library for handling dfs
z = require('https://bundle.run/zebras@0.0.11')
Insert cell
// import d3 library
d3 = require("d3")
Insert cell
// migration data URL
migrationURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/6dff0d45ed26352a7a3c7afb4ace0bad1ce8ba20/MIG_18022019163824069.csv'
Insert cell
// country dataset URL
countryURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/90951b60444376eebfcbbee9beca00c083f489f6/Metadata_Country_API_SM.POP.NETM_DS2_en_csv_v2_10473747.csv'
Insert cell
// load migration dataset
migrationRaw = d3.csv(migrationURL)
Insert cell
// parse values and years into numeric from string so that we can filter, sum over it later
// I drop the CO2 = TOT rows, because those are totals and not actually country values.
// Dropping CO2 = TOT matters for Problem 4
migration = z.filter(r => r["CO2"] != "TOT", z.parseNums(['Value', 'Year'], migrationRaw))
Insert cell
// load country dataset
country = d3.csv(countryURL)
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// what variables do we have?
migrationVar = z.unique(z.getCol("Variable", migration))
Insert cell
// what countries CO2 do we have?
countriesCO2 = z.unique(z.getCol("CO2", migration))
Insert cell
Insert cell
// Note: I think z.merge is not a true left join--it only retains the first instance of each item of the key array. therefore, the resulting join table is much smaller than the left hand side dataframe being joined upon.
// So let's try a new function that left joins a column. Foundation for next function that left joins multiple columns
// assume that df_right has unique rows for each value of right_on (so we don't need the suffix part)
// dfLeft and dfRight are the left and right dfs, respectively; leftOn and rightOn are the columns on which you are joining
//rightCol is the additional data from the right df that we want to tack onto the left df
leftjoin_col = function(dfLeft, dfRight, leftOn, rightOn, rightCol)
{
let lookup = {};
// create a look-up table where the rightOn key is associated with the value
// that we want to append to the left df; do this row by row
dfRight.forEach(function(row)
{
lookup[row[rightOn]] = row[rightCol]
})
// for the left df, look up its keys in the look-up table and fetch the associated rightCol value
let new_col = dfLeft.map(x => lookup[x[leftOn]])
return z.addCol(rightCol, new_col, dfLeft) // add that column to the left df to complete the join
}

Insert cell
// second function building off the previous one that performs left join for multiple columns by looping through the columns that need to be joined and applying the previous function repeatedly
// assume that there are no repeated column names in left_df and right_df (except maybe left_on and right_on)
leftjoin = function(dfLeft, dfRight, leftOn, rightOn) {
let retDf = null
for (const col in dfRight["0"]) {
if (col != rightOn)
{
if (retDf === null)
{
retDf = leftjoin_col(dfLeft, dfRight, leftOn, rightOn, col)
}
else
{
retDf = leftjoin_col(retDf, dfRight, leftOn, rightOn, col)
}
}
}
return retDf
}
Insert cell
// try the join again using the custom function
migrationCountry = leftjoin(migration, country, "CO2", "Country Code")
Insert cell
Insert cell
// what income levels do we have?
z.unique(z.getCol("IncomeGroup", migrationCountry))
Insert cell
Insert cell
// reformat full migration-income data from long to wide while grouping by IncomeGroup and summing by Variable
// Thanks to Yichun for pointing out that I should use vconcat for Problem 3, which then made me realized I need to reformat the migration data from long to wide in order to make them into a matrix of graphs with adjustable scales using vconcat

// first step, group by IncomeGroup
// also, get rid of unneeded columns
migrationByIncGroup = z.pipe([
z.pickCols(["CO2", "Variable", "Year", "Value", "Region", "IncomeGroup"]),
z.groupBy(r => r["IncomeGroup"])
])(migrationCountry)
Insert cell
Insert cell
// second step, within each country, sum over the values for each category
// so, each row should be a CO2, and columns will be sum of inflows, sum of outflows, etc
// retain the income group information as well

migrationWide = {
var resultsTable = [];
var incomeGroup = Object.keys(migrationByIncGroup);
//loop through income groups since that's how our table is grouped
for (let i = 0; i < incomeGroup.length; i++)
{
var incTable = z.groupBy(r => r["CO2"], migrationByIncGroup[incomeGroup[i]]);
var incCO2 = Object.keys(incTable);

//loop through all countries within each income group and make them into unique rows of output table
for (let j = 0; j < incCO2.length; j++)
{
let currRow = {};
currRow.CO2 = incCO2[j];
currRow.IncomeGroup = incomeGroup[i];

// then fill out value columns for each country
let varSum = z.gbSum("Value", z.groupBy(r => r.Variable, incTable[incCO2[j]]));
for (let k = 0; k < z.getCol("Variable", varSum).length; k++)
{
let currVarName = varSum[k].group; //order of variables may differ for each
currRow[`${currVarName}`] = varSum[k].sum;
}
resultsTable.push(currRow);
}
}
return resultsTable;
}
Insert cell
// make a version of the wide table above grouped by IncomeGroup
// above table is not yet grouped by IncomeGroup, because I realized when doing Problem 4 that I needed a version of the migration wide format table that was not income-grouped
migrationWide1 = z.groupBy(r => r["IncomeGroup"], migrationWide)
Insert cell
// creates a matrix of immigration graphs using vconcat to make each row

viewof histogramMatrix = embed({
title: "Immigration measures by country income group",
vconcat:[
{
title: "Low income countries",
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1["Low income"]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
// I wasn't sure how to set each graph to have exactly 12 bins and
// still have each variable show up on a reasonable range of x-axis values.
// So, I just used "maxbin" : 12, which picks reasonable bin sizes and delivers up to 12 bins
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
title: "Lower middle income countries",
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1["Lower middle income"]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
title: "Upper middle income countries",
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1["Upper middle income"]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
title: "High income countries",
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1["High income"]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
title: "Unknown income countries",
// my x-axis here is probably tighter than other people's because I dropped CO2 = "TOT", which had
// huge immigration figures since it's the sum over multiple countries
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1[undefined]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
}
]
})
Insert cell
Insert cell
viewof view_stackBar= embed({
title: "Migration flows, stacked by country income level",
repeat: {column : migrationVar},
data: {values: migrationWide},
spec:
{
mark: "bar",
encoding: {
x:{
bin:{"maxbins" : 12},
field: {repeat: "column"},
type: "quantitative",
},
y: {
aggregate: "count",
field: "CO2",
type: "nominal",
stack: "normalize", //this makes the normalized stack bar graph
axis: {"title": "Country count"}
},
color: {
field: "IncomeGroup",
type: "nominal"
}
}
}
})

Insert cell
Insert cell
Insert cell
// GDP per capita per country dataset, pulled from World Bank and loaded through Github
gdpURL = 'https://gist.githubusercontent.com/rlluo1/59046b73d6e0f5c5f3fb330b241007e3/raw/474b199788921546015bf3a12d5313b1c837b699/gdp_per_capita_current'
Insert cell
// load in data
gdpRaw = d3.csv(gdpURL)
Insert cell
// parse GDP data
gdp = z.parseNums(['2017_gdppc'], gdpRaw)
Insert cell
//check it out
z.head(20,gdp)
Insert cell
// note that 2017 data only has 3 variables instead of 8 listed in the original dataset
// I choose to plot "Stock of foreign-born population by country of birth" as the indicator of the "immigration level" that Question 4 asks for, since the question did not define which variable to use to proxy for "immigration level"
z.unique(z.getCol("Variable", z.filter(r => r["Year"] === 2017, migrationCountry)))
Insert cell
// select 2017 data from migration dataset; note again that we've dropped CO2 = "TOT" earlier in the pset
varCO22017 = z.filter(r => r["Year"] === 2017, migrationCountry)
Insert cell
Insert cell
//produce join key for original 2017 data table so that I can get the CO2 and variable names I need for sums table
key2017 = z.deriveCol(r => (r.CO2 + ","+ r.Variable), varCO22017)
Insert cell
//add key column to 2017 table, drop unnecessary columns
varCO22017Keyed = z.addCol("key", key2017, z.pickCols(["CO2","IncomeGroup","Region","Variable"], varCO22017))
Insert cell
//add 2017 table country data to sums table
varCO22017SumsDeet = leftjoin(varCO22017Sums, varCO22017Keyed, "group", "key")
Insert cell
//merge info from keyed 2017 table onto the sums table
migrationGDP2017 = leftjoin(varCO22017SumsDeet, gdp, "CO2", "CO2")
Insert cell
Insert cell
// only graphing 1 of 3 migration variables available for 2017
// picked the one variable that seems to be a comprehensive descriptor of "immigrant levels" and thus is relevant to the question
// don't include total rows, ie CO2 == TOT
vegalite({
data: {values: z.filter(r => r["Variable"] === "Stock of foreign population by nationality" && r["CO2"] != "TOT", migrationGDP2017)},
mark: "point",
encoding: {
x: {
field: "sum",
type: "quantitative",
axis: {"title": "Stock of foreign/immigrant population"}
},
y: {
field: "2017_gdppc",
type: "quantitative",
axis: {"title": "2017 per capita GDP"}
},
color: {
field: "Region",
type: "nominal"
},
}
})
Insert cell
// same as above but dropping the outliers
vegalite({
data: {values: z.filter(r => r["Variable"] === "Stock of foreign population by nationality" &&
r["CO2"] != "TOT" &&
r["sum"] < 4000001 &&
r["2017_gdppc"] < 150000, migrationGDP2017)},
mark: "point",
encoding: {
x: {
field: "sum",
type: "quantitative",
axis: {"title": "Stock of foreign/immigrant population"}
},
y: {
field: "2017_gdppc",
type: "quantitative",
axis: {"title": "2017 per capita GDP"}
},
color: {
field: "Region",
type: "nominal"
},
}
})
Insert cell
Insert cell
// create new table in which each row is CO2_variable_year
// this needs aggregating because each CO2 has multiple rows for recipient country
varCO2AllYearsSums = z.gbSum("Value", z.groupBy(r => [r["CO2"], r["Variable"], r["Year"]], migration))
Insert cell
//join key
keyAllYears = z.deriveCol(r => (r.CO2 + ","+ r.Variable + "," + r.Year), migration)
Insert cell
//add key to migration table, drop unneeded columns
migrationYearKeyed = z.addCol("key", keyAllYears, z.pickCols(["CO2", "Variable", "Year"], migration))
Insert cell
//merge keyed migration table onto the sums table
migrationByYear = leftjoin(varCO2AllYearsSums, migrationYearKeyed, "group", "key")
Insert cell
Insert cell
// Note that only the stock figures and asylum inflow variables (3 variables total) had 2017 numbers
// I only graphed inflows of foreign population by nationality
vegalite({
data: {values: z.filter(r => r["Variable"] === "Inflows of foreign population by nationality" &&
r["CO2"] != "TOT", migrationByYear)},
layer: [
{
mark: "point",
encoding: {
y: {
field: "sum",
type: "quantitative",
axis: {"title": "In migration"}
},
x: {
field: "Year",
type: "quantitative",
//data is not in date format so we just use quantitative type;
//tried to use temporal but it wasn't working well.
axis: {"title": "Year",
"format": "f",
"labelAngle": -45},
scale: {"domain": [2000, 2017]}
},
}
},
{
mark: "line",
encoding: {
y: {
field: "sum",
type: "quantitative",
aggregate: "mean",
axis: {"title" : "In migration"}
},
x: {
field: "Year",
type: "quantitative",
//data is not in date format so we just use quantitative type
//tried to use temporal but it wasn't working well.
axis: {"title": "Year",
"format": "f",
"labelAngle": -45},
scale: {"domain": [2000, 2017]}
},
color: {"value": "firebrick"}
}
}
]
})



Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more