Published
Edited
Mar 14, 2019
Fork of PSET 4
Insert cell
Insert cell
Insert cell
Insert cell
// import vegalite as we'll need it for graphing later
vegalite = require("@observablehq/vega-lite@0.1")
Insert cell
// import package for making matrices of graphs as we'll need it later
embed = require("vega-embed@3")
Insert cell
// import zebras library for handling dfs
z = require('https://bundle.run/zebras@0.0.11')
Insert cell
// import d3 library
d3 = require("d3")
Insert cell
// migration data URL
migrationURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/6dff0d45ed26352a7a3c7afb4ace0bad1ce8ba20/MIG_18022019163824069.csv'
Insert cell
// country dataset URL
countryURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/90951b60444376eebfcbbee9beca00c083f489f6/Metadata_Country_API_SM.POP.NETM_DS2_en_csv_v2_10473747.csv'
Insert cell
// load migration dataset
migrationRaw = d3.csv(migrationURL)
Insert cell
// parse values and years into numeric from string so that we can filter, sum over it later
// I drop the CO2 = TOT rows, because those are totals and not actually country values.
// Dropping CO2 = TOT matters for Problem 4
migration = z.filter(r => r["CO2"] != "TOT", z.parseNums(['Value', 'Year'], migrationRaw))
Insert cell
// load country dataset
country = d3.csv(countryURL)
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// what variables do we have?
migrationVar = z.unique(z.getCol("Variable", migration))
Insert cell
// what countries CO2 do we have?
countriesCO2 = z.unique(z.getCol("CO2", migration))
Insert cell
Insert cell
// Note: I think z.merge is not a true left join--it only retains the first instance of each item of the key array. therefore, the resulting join table is much smaller than the left hand side dataframe being joined upon.
// So let's try a new function that left joins a column. Foundation for next function that left joins multiple columns
// assume that df_right has unique rows for each value of right_on (so we don't need the suffix part)
// dfLeft and dfRight are the left and right dfs, respectively; leftOn and rightOn are the columns on which you are joining
//rightCol is the additional data from the right df that we want to tack onto the left df
leftjoin_col = function(dfLeft, dfRight, leftOn, rightOn, rightCol)
{
let lookup = {};
// create a look-up table where the rightOn key is associated with the value
// that we want to append to the left df; do this row by row
dfRight.forEach(function(row)
{
lookup[row[rightOn]] = row[rightCol]
})
// for the left df, look up its keys in the look-up table and fetch the associated rightCol value
let new_col = dfLeft.map(x => lookup[x[leftOn]])
return z.addCol(rightCol, new_col, dfLeft) // add that column to the left df to complete the join
}

Insert cell
// second function building off the previous one that performs left join for multiple columns by looping through the columns that need to be joined and applying the previous function repeatedly
// assume that there are no repeated column names in left_df and right_df (except maybe left_on and right_on)
leftjoin = function(dfLeft, dfRight, leftOn, rightOn) {
let retDf = null
for (const col in dfRight["0"]) {
if (col != rightOn)
{
if (retDf === null)
{
retDf = leftjoin_col(dfLeft, dfRight, leftOn, rightOn, col)
}
else
{
retDf = leftjoin_col(retDf, dfRight, leftOn, rightOn, col)
}
}
}
return retDf
}
Insert cell
// try the join again using the custom function
migrationCountry = leftjoin(migration, country, "CO2", "Country Code")
Insert cell
Insert cell
// what income levels do we have?
z.unique(z.getCol("IncomeGroup", migrationCountry))
Insert cell
Insert cell
// reformat full migration-income data from long to wide while grouping by IncomeGroup and summing by Variable
// Thanks to Yichun for pointing out that I should use vconcat for Problem 3, which then made me realized I need to reformat the migration data from long to wide in order to make them into a matrix of graphs with adjustable scales using vconcat

// first step, group by IncomeGroup
// also, get rid of unneeded columns
migrationByIncGroup = z.pipe([
z.pickCols(["CO2", "Variable", "Year", "Value", "Region", "IncomeGroup"]),
z.groupBy(r => r["IncomeGroup"])
])(migrationCountry)
Insert cell
Insert cell
// second step, within each country, sum over the values for each category
// so, each row should be a CO2, and columns will be sum of inflows, sum of outflows, etc
// retain the income group information as well

migrationWide = {
var resultsTable = [];
var incomeGroup = Object.keys(migrationByIncGroup);
//loop through income groups since that's how our table is grouped
for (let i = 0; i < incomeGroup.length; i++)
{
var incTable = z.groupBy(r => r["CO2"], migrationByIncGroup[incomeGroup[i]]);
var incCO2 = Object.keys(incTable);

//loop through all countries within each income group and make them into unique rows of output table
for (let j = 0; j < incCO2.length; j++)
{
let currRow = {};
currRow.CO2 = incCO2[j];
currRow.IncomeGroup = incomeGroup[i];

// then fill out value columns for each country
let varSum = z.gbSum("Value", z.groupBy(r => r.Variable, incTable[incCO2[j]]));
for (let k = 0; k < z.getCol("Variable", varSum).length; k++)
{
let currVarName = varSum[k].group; //order of variables may differ for each
currRow[`${currVarName}`] = varSum[k].sum;
}
resultsTable.push(currRow);
}
}
return resultsTable;
}
Insert cell
// make a version of the wide table above grouped by IncomeGroup
// above table is not yet grouped by IncomeGroup, because I realized when doing Problem 4 that I needed a version of the migration wide format table that was not income-grouped
migrationWide1 = z.groupBy(r => r["IncomeGroup"], migrationWide)
Insert cell
// creates a matrix of immigration graphs using vconcat to make each row

viewof histogramMatrix = embed({
title: "Immigration measures by country income group",
vconcat:[
{
title: "Low income countries",
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1["Low income"]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
// I wasn't sure how to set each graph to have exactly 12 bins and
// still have each variable show up on a reasonable range of x-axis values.
// So, I just used "maxbin" : 12, which picks reasonable bin sizes and delivers up to 12 bins
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
title: "Lower middle income countries",
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1["Lower middle income"]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
title: "Upper middle income countries",
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1["Upper middle income"]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
title: "High income countries",
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1["High income"]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
title: "Unknown income countries",
// my x-axis here is probably tighter than other people's because I dropped CO2 = "TOT", which had
// huge immigration figures since it's the sum over multiple countries
repeat: {column: migrationVar},
spec:{
data: {values: migrationWide1[undefined]},
mark: "bar",
encoding: {
x: {
bin:{"maxbins" : 12},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
}
]
})
Insert cell
Insert cell
viewof view_stackBar= embed({
title: "Migration flows, stacked by country income level",
repeat: {column : migrationVar},
data: {values: migrationWide},
spec:
{
mark: "bar",
encoding: {
x:{
bin:{"maxbins" : 12},
field: {repeat: "column"},
type: "quantitative",
},
y: {
aggregate: "count",
field: "CO2",
type: "nominal",
stack: "normalize", //this makes the normalized stack bar graph
axis: {"title": "Country count"}
},
color: {
field: "IncomeGroup",
type: "nominal"
}
}
}
})

Insert cell
Insert cell
Insert cell
// GDP per capita per country dataset, pulled from World Bank and loaded through Github
gdpURL = 'https://gist.githubusercontent.com/rlluo1/59046b73d6e0f5c5f3fb330b241007e3/raw/474b199788921546015bf3a12d5313b1c837b699/gdp_per_capita_current'
Insert cell
// load in data
gdpRaw = d3.csv(gdpURL)
Insert cell
// parse GDP data
gdp = z.parseNums(['2017_gdppc'], gdpRaw)
Insert cell
//check it out
z.head(20,gdp)
Insert cell
// note that 2017 data only has 3 variables instead of 8 listed in the original dataset
// I choose to plot "Stock of foreign-born population by country of birth" as the indicator of the "immigration level" that Question 4 asks for, since the question did not define which variable to use to proxy for "immigration level"
z.unique(z.getCol("Variable", z.filter(r => r["Year"] === 2017, migrationCountry)))
Insert cell
// select 2017 data from migration dataset; note again that we've dropped CO2 = "TOT" earlier in the pset
varCO22017 = z.filter(r => r["Year"] === 2017, migrationCountry)
Insert cell
Insert cell
//produce join key for original 2017 data table so that I can get the CO2 and variable names I need for sums table
key2017 = z.deriveCol(r => (r.CO2 + ","+ r.Variable), varCO22017)
Insert cell
//add key column to 2017 table, drop unnecessary columns
varCO22017Keyed = z.addCol("key", key2017, z.pickCols(["CO2","IncomeGroup","Region","Variable"], varCO22017))
Insert cell
//add 2017 table country data to sums table
varCO22017SumsDeet = leftjoin(varCO22017Sums, varCO22017Keyed, "group", "key")
Insert cell
//merge info from keyed 2017 table onto the sums table
migrationGDP2017 = leftjoin(varCO22017SumsDeet, gdp, "CO2", "CO2")
Insert cell
Insert cell
// only graphing 1 of 3 migration variables available for 2017
// picked the one variable that seems to be a comprehensive descriptor of "immigrant levels" and thus is relevant to the question
// don't include total rows, ie CO2 == TOT
vegalite({
data: {values: z.filter(r => r["Variable"] === "Stock of foreign population by nationality" && r["CO2"] != "TOT", migrationGDP2017)},
mark: "point",
encoding: {
x: {
field: "sum",
type: "quantitative",
axis: {"title": "Stock of foreign/immigrant population"}
},
y: {
field: "2017_gdppc",
type: "quantitative",
axis: {"title": "2017 per capita GDP"}
},
color: {
field: "Region",
type: "nominal"
},
}
})
Insert cell
// same as above but dropping the outliers
vegalite({
data: {values: z.filter(r => r["Variable"] === "Stock of foreign population by nationality" &&
r["CO2"] != "TOT" &&
r["sum"] < 4000001 &&
r["2017_gdppc"] < 150000, migrationGDP2017)},
mark: "point",
encoding: {
x: {
field: "sum",
type: "quantitative",
axis: {"title": "Stock of foreign/immigrant population"}
},
y: {
field: "2017_gdppc",
type: "quantitative",
axis: {"title": "2017 per capita GDP"}
},
color: {
field: "Region",
type: "nominal"
},
}
})
Insert cell
Insert cell
// create new table in which each row is CO2_variable_year
// this needs aggregating because each CO2 has multiple rows for recipient country
varCO2AllYearsSums = z.gbSum("Value", z.groupBy(r => [r["CO2"], r["Variable"], r["Year"]], migration))
Insert cell
//join key
keyAllYears = z.deriveCol(r => (r.CO2 + ","+ r.Variable + "," + r.Year), migration)
Insert cell
//add key to migration table, drop unneeded columns
migrationYearKeyed = z.addCol("key", keyAllYears, z.pickCols(["CO2", "Variable", "Year"], migration))
Insert cell
//merge keyed migration table onto the sums table
migrationByYear = leftjoin(varCO2AllYearsSums, migrationYearKeyed, "group", "key")
Insert cell
Insert cell
// Note that only the stock figures and asylum inflow variables (3 variables total) had 2017 numbers
// I only graphed inflows of foreign population by nationality
vegalite({
data: {values: z.filter(r => r["Variable"] === "Inflows of foreign population by nationality" &&
r["CO2"] != "TOT", migrationByYear)},
layer: [
{
mark: "point",
encoding: {
y: {
field: "sum",
type: "quantitative",
axis: {"title": "In migration"}
},
x: {
field: "Year",
type: "quantitative",
//data is not in date format so we just use quantitative type;
//tried to use temporal but it wasn't working well.
axis: {"title": "Year",
"format": "f",
"labelAngle": -45},
scale: {"domain": [2000, 2017]}
},
}
},
{
mark: "line",
encoding: {
y: {
field: "sum",
type: "quantitative",
aggregate: "mean",
axis: {"title" : "In migration"}
},
x: {
field: "Year",
type: "quantitative",
//data is not in date format so we just use quantitative type
//tried to use temporal but it wasn't working well.
axis: {"title": "Year",
"format": "f",
"labelAngle": -45},
scale: {"domain": [2000, 2017]}
},
color: {"value": "firebrick"}
}
}
]
})



Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more