Published
Edited
Mar 13, 2019
1 fork
Insert cell
Insert cell
Insert cell
Insert cell
// import vegalite
vegalite = require("@observablehq/vega-lite@0.1")
Insert cell
Insert cell
// Loading necessary library
z = require('https://bundle.run/zebras@0.0.11')
Insert cell
// Loading necessary library
d3 = require("d3")
Insert cell
// migration data URL
migrationURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/6dff0d45ed26352a7a3c7afb4ace0bad1ce8ba20/MIG_18022019163824069.csv'
Insert cell
countryURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/90951b60444376eebfcbbee9beca00c083f489f6/Metadata_Country_API_SM.POP.NETM_DS2_en_csv_v2_10473747.csv'
Insert cell
// load migration dataset
migrationRaw = d3.csv(migrationURL)
Insert cell
migration = z.parseNums(['Value','Year'], migrationRaw)
Insert cell
// load country dataset
country = d3.csv(countryURL)
Insert cell
//check out migration dataset
z.head(10, migration)
Insert cell
// check out shape of migration dataset
shape = ({rows: z.getCol('CO2', migration).length, columns:Object.keys(migration[0]).length})
Insert cell
//check out country dataset
shapeCountry = ({rows: z.getCol('Country Code', country).length, columns:Object.keys(country[0]).length})
Insert cell
//check out country dataset
z.head(20, country)
Insert cell
Insert cell
// Join based on country code column in the country dataset, and the CO2 code in the migration dataset
migrationCountryTest = z.merge(migration, country, "CO2", "Country Code", "_m", "_c")
//migrationCountryTest = z.merge(country, migration, "Country Code", "CO2", "_m", "_c")
// Note: I think z.merge is not a true left join--it only retains the first instance of each item of the key array. therefore, the resulting join table is much smaller than the left hand side dataframe being joined upon. See source code line 87 https://zebrasjs.com/merge.js.html

Insert cell
Insert cell
Insert cell
// try the join again using the custom function
migrationCountry = leftjoin(migration, country, "CO2", "Country Code")
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// group on a key that I generate, CO2_variable, which is unique for every income group+variable combination
varCO2Sums = z.gbSum("Value", z.groupBy(r => [r["CO2"], r["Variable"]], migrationCountry))
Insert cell
// generate country, variable columns that aren't concated together, so that I can graph it
// include on it the country income level, region data
// begin by creating a join key
key = z.deriveCol(r => (r.CO2 + ","+ r.Variable), migrationCountry)
Insert cell
// then add column key to the migration data table, dropping the unnecessary columns
migrationCountryKeyed = z.addCol("key", key, z.pickCols(["CO2","IncomeGroup","Variable"], migrationCountry))
Insert cell
// join the variable, income group names to the aggregate table
incomeVar = leftjoin(varCO2Sums, migrationCountryKeyed, "group" , "key")
Insert cell
// drop the undefined income and variable rows
incomeVarDefined = z.pipe([
z.filter( r => r['Variable'] == "Inflows of foreign population by nationality" ||
r['Variable'] == "Outflows of foreign population by nationality" ||
r['Variable'] == "Stock of foreign-born population by country of birth" ||
r['Variable'] == "Inflows of asylum seekers by nationality" ||
r['Variable'] == "Stock of foreign population by nationality"||
r['Variable'] == "Acquisition of nationality by country of former nationality" ||
r['Variable'] == "Stock of foreign-born labour by country of birth" ||
r['Variable'] == "Stock of foreign labour by nationality"),
z.filter( r => r['IncomeGroup'] == "Low income" ||
r['IncomeGroup'] == "Lower middle income" ||
r['IncomeGroup'] == "Upper middle income" ||
r['IncomeGroup'] == "High income"
)
])(incomeVar)
Insert cell
Insert cell
incomeGroups = z.unique(z.getCol('IncomeGroup', incomeVarDefined))
Insert cell
migVar = z.unique(z.getCol("Variable",incomeVarDefined))
Insert cell
viewof histogramMatrix = embed({
title: "Immigration measures by income group",
vconcat:[{
repeat: {column: z.getCol("Variable",incomeVarDefined)},
spec:{
data: {values: z.filter}, //reformat data?
mark: "bar",
encoding: {
x: {
bin:{"binned": true, "step": 700000},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
},
{
repeat: {column: migVar},
spec:{
data: {values: z.filter(r => r["IncomeGroup"] === "Lower middle income", incomeVarDefined)},
mark: "bar",
encoding: {
x: {
bin:{"binned": true, "step": 700000},
field:{"repeat":"column"},
type:"quantitative"
},
y: {
aggregate: "count",
field:"CO2",
type:"quantitative",
title: "Country Count"
}
}
}
}

]
})
Insert cell
// how to get x-scales to be different across columns? can try vconcat on grouups of columns withsimilar scales!
vegalite({
data: {values: incomeVarDefined},
mark: "bar",
encoding: {
x: {
bin: {
"binned": true
},
field: "sum",
type: "quantitative",
axis: {"title": "Total"},
//scale: {"domain": [0,20000000]}
},
y: {
aggregate: "count",
field: "CO2",
type: "nominal",
axis: {"title": "Country count"}
},
row: {
field: "IncomeGroup",
type: "nominal"
},
column: {
field: "Variable",
type: "nominal",
}
}
})

Insert cell
Insert cell
viewof view_stackBar= embed({
title: "Immigration by income groups and migration categories",
}})
Insert cell
// followed Stacked guide: https://vega.github.io/vega-lite/docs/stack.html
vegalite({
data: {values: incomeVarDefined},

mark: "bar",
encoding: {
x: {
bin: {
"binned": true,
},
field: "sum",
type: "quantitative",
axis: {"title": "Total"},
//scale: {"domain": [0,20000000]}
},
y: {
aggregate: "count",
field: "CO2",
type: "nominal",
stack: "normalize",
axis: {"title": "Country count"}
},
color: {
field: "IncomeGroup",
type: "nominal"
},

column: {
field: "Variable",
type: "nominal"
}
}

})


Insert cell
Insert cell
// GDP per capita per country dataset, pulled from World Bank and loaded through Github
gdpURL = 'https://gist.githubusercontent.com/rlluo1/59046b73d6e0f5c5f3fb330b241007e3/raw/474b199788921546015bf3a12d5313b1c837b699/gdp_per_capita_current'
Insert cell
// load in data
gdpRaw = d3.csv(gdpURL)
Insert cell
gdp = z.parseNums(['2017_gdppc'], gdpRaw)
Insert cell
//check it out
z.head(20,gdp)
Insert cell
//in my migration dataset, select 2017 data only,
varCO22017 = z.filter(r => r["Year"] === 2017, migrationCountry)
Insert cell
// note that 2017 data only has 3 variables instead of 8
// there are still 210 CO2s, so we expect on order of 3*210 = 630 rows when we aggregate by country-variable
z.unique(z.getCol("Variable", varCO22017))
Insert cell
//then aggregate by CO2 & variable, ie, we sum over all the Countries for each Country of Birth
//now each CO2 - variable is a row
varCO22017Sums = z.gbSum("Value", z.groupBy(r => [r["CO2"], r["Variable"]], varCO22017))
Insert cell
//produce join key for original 2017 data table so that I can get the CO2 and variable names I need for sums table
key2017 = z.deriveCol(r => (r.CO2 + ","+ r.Variable), varCO22017)
Insert cell
//add key column to 2017 table, drop unnecessary columns
varCO22017Keyed = z.addCol("key", key2017, z.pickCols(["CO2","IncomeGroup","Region","Variable"], varCO22017))
Insert cell
//add 2017 table country data to sums table
varCO22017SumsDeet = leftjoin(varCO22017Sums, varCO22017Keyed, "group", "key")
Insert cell
//merge info from keyed 2017 table onto the sums table
migrationGDP2017 = leftjoin(varCO22017SumsDeet, gdp, "CO2", "CO2")
Insert cell
Insert cell
// only graphing 1 of 3 migration variables because axis scales don't match across the three
// picked the one variable most relevant to the question
// don't include total rows, ie CO2 == TOT
vegalite({
data: {values: z.filter(r => r["Variable"] === "Stock of foreign population by nationality" && r["CO2"] != "TOT", migrationGDP2017)},
mark: "point",
encoding: {
x: {
field: "sum",
type: "quantitative",
axis: {"title": "Stock of foreign/immigrant population"}
},
y: {
field: "2017_gdppc",
type: "quantitative",
axis: {"title": "2017 per capita GDP"}
},
color: {
field: "Region",
type: "nominal"
},
}
})
Insert cell
// same as above but dropping the outliers
vegalite({
data: {values: z.filter(r => r["Variable"] === "Stock of foreign population by nationality" &&
r["CO2"] != "TOT" &&
r["sum"] < 4000001 &&
r["2017_gdppc"] < 150000, migrationGDP2017)},
mark: "point",
encoding: {
x: {
field: "sum",
type: "quantitative",
axis: {"title": "Stock of foreign/immigrant population"}
},
y: {
field: "2017_gdppc",
type: "quantitative",
axis: {"title": "2017 per capita GDP"}
},
color: {
field: "Region",
type: "nominal"
},
}
})
Insert cell
Insert cell
Insert cell
// create new table in which each row is CO2_variable_year
// this needs aggregating because each CO2 has multiple rows for recipient country
varCO2AllYearsSums = z.gbSum("Value", z.groupBy(r => [r["CO2"], r["Variable"], r["Year"]], migration))
Insert cell
//join key
keyAllYears = z.deriveCol(r => (r.CO2 + ","+ r.Variable + "," + r.Year), migration)
Insert cell
//add key to migration table, drop unneeded columns
migrationYearKeyed = z.addCol("key", keyAllYears, z.pickCols(["CO2", "Variable", "Year"], migration))
Insert cell
//merge keyed migration table onto the sums table
migrationByYear = leftjoin(varCO2AllYearsSums, migrationYearKeyed, "group", "key")
Insert cell
Insert cell
// Note that only the stock figures and asylum inflow variables (3 variables total) had 2017 numbers
vegalite({
data: {values: z.filter(r => r["Variable"] === "Inflows of foreign population by nationality" &&
r["CO2"] != "TOT", migrationByYear)},
layer: [
{
mark: "point",
encoding: {
y: {
field: "sum",
type: "quantitative",
axis: {"title": "In migration"}
},
x: {
field: "Year",
type: "quantitative",
//data is not in date format so we just use quantitative type
axis: {"title": "Year"},
format: "",
scale: {"domain": [2000, 2017]}
},
}
},
{
mark: "line",
encoding: {
y: {
field: "sum",
type: "quantitative",
aggregate: "mean",
axis: {"title" : "In migration"}
},
x: {
field: "Year",
type: "quantitative",
//data is not in date format so we just use quantitative type
axis: {"title": "Year"},
format: "d",
scale: {"domain": [2000, 2017]}
},
color: {"value": "firebrick"}
}
}
]
})

Insert cell
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more