Published
Edited
Apr 1, 2019
Insert cell
Insert cell
Insert cell
Insert cell
// Loading necessary library
z = require('https://bundle.run/zebras@0.0.11')
Insert cell
// Loading necessary library
d3 = require("d3")
Insert cell
// Target appropriate URL for dataset
migrationURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/6dff0d45ed26352a7a3c7afb4ace0bad1ce8ba20/MIG_18022019163824069.csv'
Insert cell
// Load data from target URL
migration = d3.csv(migrationURL)
Insert cell
// check out dimensions of our dataset
shape = ({rows: z.getCol('CO2', migration).length, columns:Object.keys(migration[0]).length})
Insert cell
// examine the dataset
z.head(20, migration)
Insert cell
Insert cell
// Specify the target column, "Country of birth/nationality," then get the unique values within it
z.unique(z.getCol('Country of birth/nationality', migration))
Insert cell
Insert cell
//overall process:
// create groupby key by concatenating 'country' and 'variable'
// reduce df to the 3 variables we want to look at
// parse values into numbers so that we can sum over them
// group the reduced/subsetted df by country & variable, summing over the country_variable groups (this aggregates over the years)
// convert those sums from a long into a wide table
// NOTE: in this part, we sum over the "Country" variable, not the "Country of birth/nationality" variable that we aggregate over for problems 5-6. This is because the problem says "per country" in bold. sorry if that was a misinterpretation of the problem.

// create groupby key from "Country" and "Variable"
groupLabel = z.deriveCol(r => (r.Variable + r.Country), migration)
Insert cell
// add groupby key to the full migration table
migrationKeyed = z.addCol("key", groupLabel, migration)
Insert cell
// check to make sure key is formed correctly and attended to the main table properly
z.tail(10, migrationKeyed)
Insert cell
// subset to just the variables we want to consider, and convert "Value" to numbers so that we can add them

countryStat = z.pipe([
//subset df to just the relevant variables
z.filter(r => r['Variable'] == "Inflows of foreign population by nationality" ||
r['Variable'] == "Outflows of foreign population by nationality" ||
r['Variable'] == "Stock of foreign-born population by country of birth"),
// parse values into numbers
z.parseNums(['Value']),
])(migrationKeyed)
Insert cell
// groupby to produce a sum for each key (ie, each country and variable combination)
migrationSums = z.gbSum("Value", z.groupBy(d => d.key, countryStat))
Insert cell
// Produces a new table where each row is a country listed in the migration table's "Country" column (NOT 'Country of birth/nationality' column)
// This table also includes 3 additional data columns containing the sum of inflows, sum of outflows, or sum of stocks for that country. For each country-variable combination, we are summing over all other factors including year, gender, country of birth/nationality

df_summed = {
// create temp variables to hold the structure of the new df, ie the countries and variables to be included
let countryList = z.unique(z.getCol("Country", countryStat))
let countryCodes = z.unique(z.getCol("COU", countryStat))
let varList = z.unique(z.getCol("Variable", countryStat))
let columnNames = ["country", "inflow", "outflow", "stock"]
let sums = []
// loop over all countries
// for each country, add a row to output table with the country's total name, code, inflow, outflow, and stock
for (var i = 0; i < countryList.length; i++)
{
let currRow = {country: countryList[i],
code: countryCodes[i],
inflow: 0,
outflow: 0,
stock: 0}
// to fill out the inflow, outflow, and stock information,
// create the variable_country key unique to that value and index into the migrationSums table
// to fetch that value and add it tto our output ttable
let tag = ""
for (var j = 0; j < varList.length; j++)
{
tag = varList[j] + countryList[i]
currRow[columnNames[j+1]] = z.getCol("sum", (z.filter(r => r["group"] == tag, migrationSums)))[0]
}
sums.push(currRow) // push row to output table
}
return sums
}
Insert cell
Insert cell
z.print(df_summed) // print out results table
Insert cell
Insert cell
// Load data from target URL
codesURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/0c14fa1b1f65a610491417b2894fceecde5f4260/country-and-continent-codes-list-csv_csv.csv'

Insert cell
// Load data from target URL
codes = d3.csv(codesURL)
Insert cell
//take a peek at the df!
z.head(10, codes)
Insert cell
// check out odd cases where one country belongs to multiple continents; Turkey is listed twice here, once in Europe and once in Asia
z.filter(r => r["Three_Letter_Country_Code"] == 'TUR', codes)
Insert cell
// df with only country codes and continent
// Note: checks above reveals that Turkey falls into both Europe and Asia bins
codesSlim = z.pipe(
[
z.dropCol("Continent_Code"),
z.dropCol("Country_Name"),
z.dropCol("Two_Letter_Country_Code"),
z.dropCol("Country_Number")
]
)(codes)
Insert cell
// check
z.head(10, codesSlim)
Insert cell
// Join this continent table to the summary migration by country table I produced earlier
// Note that this put Turkey in Europe because the Europe option comes up ahead of the Asia option in the continents table
// because we wish to retain all country rows for which we have data, we use df_summed as the left part of the join
migrationContJoin = z.merge(df_summed, codesSlim, "code", "Three_Letter_Country_Code", "_summed", "_codelist")
Insert cell
// print out joined table
z.head(35, migrationContJoin)
Insert cell
// group inflow again to a more aggregated level, this time by continent
inflowCont = z.gbSum("inflow", z.groupBy(d => d.Continent_Name, migrationContJoin))
Insert cell
// do the same for outflows
outflowCont = z.gbSum("outflow", z.groupBy(d => d.Continent_Name, migrationContJoin))
Insert cell
// do the same for stock
stockCont = z.gbSum("stock", z.groupBy(d => d.Continent_Name, migrationContJoin))
Insert cell
// join to create new dataset
continentInOut = z.merge(inflowCont, outflowCont, "group", "group", "_in","_out")
Insert cell
// join remaining set to create full new dataset
continentSumDraft = z.merge(continentInOut, stockCont, "group", "group", "_inOut","_stock")
Insert cell
//clean up the table
continentSum = {
// form frame for the output table, including the variables and continents to be listed
let contList = z.unique(z.getCol("group", continentSumDraft))
let columnNames = ["continent", "inflow", "outflow", "stock"]
let draftNames = ["sum_in","sum_out","sum"]
let new_df = []
// loop through each continent (ie, row) and fill out its inflow, outflow, and stock
// by indexing into the appropriate continent + variable combination in the continentSumDraft table
for (var i = 0; i < contList.length; i++)
{
let thisRow = {continent: contList[i],
inflow: z.getCol("sum_in", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0],
outflow: z.getCol("sum_out", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0],
stock: z.getCol("sum", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0]}
new_df.push(thisRow)
}
return new_df // return the new df
}
Insert cell
Insert cell
//print table
// Note that Turkey is included in Europe
// This is based on the "Country" column, not "country of birth/nationality"
z.print(continentSum)
Insert cell
Insert cell
Insert cell
Insert cell
// Now aggregating by countries of BIRTH
// first create labels to group on; labels are variable + birth country code
birthFlowLabel = z.deriveCol(r => (r.Variable + r.CO2), migration)
Insert cell
// add that groupby key from above to the full data table
migrationBirthKey = z.addCol("birthKey", birthFlowLabel, migration)
Insert cell
// check that the key showed up
z.tail(20, migrationBirthKey)
Insert cell
// I should have read entire pset first, and produced some items like a parsed DF up front so that it could be used by problems 3 and 5. Sorry for the redundancy here!

// pick out the flow variables
birthMigrationStat = z.pipe([
//subset df to just the relevant variables
z.filter(r => r['Variable'] == "Inflows of foreign population by nationality" ||
r['Variable'] == "Outflows of foreign population by nationality" ||
r['Variable'] == "Inflows of asylum seekers by nationality"),
// parse values into numbers
z.parseNums(['Value']),
])(migrationBirthKey)
Insert cell
// groupby and sum over country of birth and variable
birthMigrationSums = z.gbSum('Value', z.groupBy(d => d.birthKey, birthMigrationStat))
Insert cell
// fill new dataframe with the sums

birthSummed = {
// set up dataframe
let birthCountryList = z.unique(z.getCol("CO2", birthMigrationStat))
let countryNames = z.unique(z.getCol("Country of birth/nationality", birthMigrationStat))
let flowVars = z.unique(z.getCol("Variable", birthMigrationStat))
let colNames = ["birth_country_code", "birth_country", "inflow", "outflow", "asylum_inflow"]
let birthSums = []
// loop through each birth country and fill in its flow values
// by using the key values to index into the birthMigrationSums table
for (var i = 0; i < birthCountryList.length; i++)
{
let aRow = {birth_country_code: birthCountryList[i],
birth_country: countryNames[i],
inflow: 0,
outflow: 0,
asylum_inflow: 0}
let label = ""
for (var j = 0; j < flowVars.length; j++)
{
label = flowVars[j] + birthCountryList[i]
aRow[colNames[j+2]] = z.getCol("sum", (z.filter(r => r["group"] == label, birthMigrationSums)))[0]
}
birthSums.push(aRow)
}
return birthSums
}
Insert cell
Insert cell
z.print(birthSummed) // final table printed
Insert cell
Insert cell
// filter for variable = "Stock of foreign-born population by country of birth"
// filter for 2016 data only

stock2016 = z.pipe(
[
z.filter(r => r['Variable'] == "Stock of foreign-born population by country of birth" && r['Year'] == "2016"),
// convert values to numbers
z.parseNums(['Value'])
]
)(migrationBirthKey)

Insert cell
// aggregate 2016 stock data by COUNTRY OF BIRTH; otherwise there are no cases that fall into low immigration for Part C of this problem
stockForeign2016 = z.gbSum("Value", z.groupBy(d => d.CO2, stock2016))
Insert cell
Insert cell
z.print(stockForeign2016) // final table with only stock of foreign migrants by country of birth, and 2016 info, summed by country of birth/nationality
Insert cell
Insert cell
// Table 1: high immigration, > 100,000
// split original table into 3 by filtering for the appropriate conditions
high_immigration = z.filter(r => r["sum"] > 100000, stockForeign2016)
Insert cell
// Table 2: low immigration, > 50,000 & < 100,000
medium_immigration = z.filter(r => r["sum"] > 50000 && r["sum"] < 100000, stockForeign2016)
Insert cell
//Low Immigration < 50,000
low_immigration = z.filter(r => r["sum"] < 50000, stockForeign2016)
Insert cell
// print high immigration table
z.print(high_immigration)
Insert cell
// print medium immigration table
z.print(medium_immigration)
Insert cell
// print low immigration table
z.print(low_immigration)
Insert cell
Insert cell
// This can be built using the table I generated for part A of Question 5 above, which aggregated each type of flow (in, out, asylum flows) by country of birth/nationality

netFlow = {
// set up the df
let ccodeList = z.unique(z.getCol("birth_country_code", birthSummed))
let cList = z.unique(z.getCol("birth_country", birthSummed))
let df = []

// loop through each birth country, copy down its inflow and outflow,
// then calculate net inflow = inflow - outflow
for (var i = 0; i < cList.length; i++)
{
let inVal = z.getCol("inflow", (z.filter(r => r["birth_country_code"] == ccodeList[i], birthSummed)))[0]
let outVal = z.getCol("outflow", (z.filter(r => r["birth_country_code"] == ccodeList[i], birthSummed)))[0]
df.push({
birth_country_code: ccodeList[i],
birth_country: cList[i],
inflow: inVal,
outflow: outVal,
inflow_outflow: (inVal - outVal)
})

}
return df
}

Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more