Published
Edited
Apr 1, 2019
Insert cell
Insert cell
Insert cell
Insert cell
// Loading necessary library
z = require('https://bundle.run/zebras@0.0.11')
Insert cell
// Loading necessary library
d3 = require("d3")
Insert cell
// Target appropriate URL for dataset
migrationURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/6dff0d45ed26352a7a3c7afb4ace0bad1ce8ba20/MIG_18022019163824069.csv'
Insert cell
// Load data from target URL
migration = d3.csv(migrationURL)
Insert cell
// check out dimensions of our dataset
shape = ({rows: z.getCol('CO2', migration).length, columns:Object.keys(migration[0]).length})
Insert cell
// examine the dataset
z.head(20, migration)
Insert cell
Insert cell
// Specify the target column, "Country of birth/nationality," then get the unique values within it
z.unique(z.getCol('Country of birth/nationality', migration))
Insert cell
Insert cell
//overall process:
// create groupby key by concatenating 'country' and 'variable'
// reduce df to the 3 variables we want to look at
// parse values into numbers so that we can sum over them
// group the reduced/subsetted df by country & variable, summing over the country_variable groups (this aggregates over the years)
// convert those sums from a long into a wide table
// NOTE: in this part, we sum over the "Country" variable, not the "Country of birth/nationality" variable that we aggregate over for problems 5-6. This is because the problem says "per country" in bold. sorry if that was a misinterpretation of the problem.

// create groupby key from "Country" and "Variable"
groupLabel = z.deriveCol(r => (r.Variable + r.Country), migration)
Insert cell
// add groupby key to the full migration table
migrationKeyed = z.addCol("key", groupLabel, migration)
Insert cell
// check to make sure key is formed correctly and attended to the main table properly
z.tail(10, migrationKeyed)
Insert cell
// subset to just the variables we want to consider, and convert "Value" to numbers so that we can add them

countryStat = z.pipe([
//subset df to just the relevant variables
z.filter(r => r['Variable'] == "Inflows of foreign population by nationality" ||
r['Variable'] == "Outflows of foreign population by nationality" ||
r['Variable'] == "Stock of foreign-born population by country of birth"),
// parse values into numbers
z.parseNums(['Value']),
])(migrationKeyed)
Insert cell
// groupby to produce a sum for each key (ie, each country and variable combination)
migrationSums = z.gbSum("Value", z.groupBy(d => d.key, countryStat))
Insert cell
// Produces a new table where each row is a country listed in the migration table's "Country" column (NOT 'Country of birth/nationality' column)
// This table also includes 3 additional data columns containing the sum of inflows, sum of outflows, or sum of stocks for that country. For each country-variable combination, we are summing over all other factors including year, gender, country of birth/nationality

df_summed = {
// create temp variables to hold the structure of the new df, ie the countries and variables to be included
let countryList = z.unique(z.getCol("Country", countryStat))
let countryCodes = z.unique(z.getCol("COU", countryStat))
let varList = z.unique(z.getCol("Variable", countryStat))
let columnNames = ["country", "inflow", "outflow", "stock"]
let sums = []
// loop over all countries
// for each country, add a row to output table with the country's total name, code, inflow, outflow, and stock
for (var i = 0; i < countryList.length; i++)
{
let currRow = {country: countryList[i],
code: countryCodes[i],
inflow: 0,
outflow: 0,
stock: 0}
// to fill out the inflow, outflow, and stock information,
// create the variable_country key unique to that value and index into the migrationSums table
// to fetch that value and add it tto our output ttable
let tag = ""
for (var j = 0; j < varList.length; j++)
{
tag = varList[j] + countryList[i]
currRow[columnNames[j+1]] = z.getCol("sum", (z.filter(r => r["group"] == tag, migrationSums)))[0]
}
sums.push(currRow) // push row to output table
}
return sums
}
Insert cell
Insert cell
z.print(df_summed) // print out results table
Insert cell
Insert cell
// Load data from target URL
codesURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/0c14fa1b1f65a610491417b2894fceecde5f4260/country-and-continent-codes-list-csv_csv.csv'

Insert cell
// Load data from target URL
codes = d3.csv(codesURL)
Insert cell
//take a peek at the df!
z.head(10, codes)
Insert cell
// check out odd cases where one country belongs to multiple continents; Turkey is listed twice here, once in Europe and once in Asia
z.filter(r => r["Three_Letter_Country_Code"] == 'TUR', codes)
Insert cell
// df with only country codes and continent
// Note: checks above reveals that Turkey falls into both Europe and Asia bins
codesSlim = z.pipe(
[
z.dropCol("Continent_Code"),
z.dropCol("Country_Name"),
z.dropCol("Two_Letter_Country_Code"),
z.dropCol("Country_Number")
]
)(codes)
Insert cell
// check
z.head(10, codesSlim)
Insert cell
// Join this continent table to the summary migration by country table I produced earlier
// Note that this put Turkey in Europe because the Europe option comes up ahead of the Asia option in the continents table
// because we wish to retain all country rows for which we have data, we use df_summed as the left part of the join
migrationContJoin = z.merge(df_summed, codesSlim, "code", "Three_Letter_Country_Code", "_summed", "_codelist")
Insert cell
// print out joined table
z.head(35, migrationContJoin)
Insert cell
// group inflow again to a more aggregated level, this time by continent
inflowCont = z.gbSum("inflow", z.groupBy(d => d.Continent_Name, migrationContJoin))
Insert cell
// do the same for outflows
outflowCont = z.gbSum("outflow", z.groupBy(d => d.Continent_Name, migrationContJoin))
Insert cell
// do the same for stock
stockCont = z.gbSum("stock", z.groupBy(d => d.Continent_Name, migrationContJoin))
Insert cell
// join to create new dataset
continentInOut = z.merge(inflowCont, outflowCont, "group", "group", "_in","_out")
Insert cell
// join remaining set to create full new dataset
continentSumDraft = z.merge(continentInOut, stockCont, "group", "group", "_inOut","_stock")
Insert cell
//clean up the table
continentSum = {
// form frame for the output table, including the variables and continents to be listed
let contList = z.unique(z.getCol("group", continentSumDraft))
let columnNames = ["continent", "inflow", "outflow", "stock"]
let draftNames = ["sum_in","sum_out","sum"]
let new_df = []
// loop through each continent (ie, row) and fill out its inflow, outflow, and stock
// by indexing into the appropriate continent + variable combination in the continentSumDraft table
for (var i = 0; i < contList.length; i++)
{
let thisRow = {continent: contList[i],
inflow: z.getCol("sum_in", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0],
outflow: z.getCol("sum_out", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0],
stock: z.getCol("sum", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0]}
new_df.push(thisRow)
}
return new_df // return the new df
}
Insert cell
Insert cell
//print table
// Note that Turkey is included in Europe
// This is based on the "Country" column, not "country of birth/nationality"
z.print(continentSum)
Insert cell
Insert cell
Insert cell
Insert cell
// Now aggregating by countries of BIRTH
// first create labels to group on; labels are variable + birth country code
birthFlowLabel = z.deriveCol(r => (r.Variable + r.CO2), migration)
Insert cell
// add that groupby key from above to the full data table
migrationBirthKey = z.addCol("birthKey", birthFlowLabel, migration)
Insert cell
// check that the key showed up
z.tail(20, migrationBirthKey)
Insert cell
// I should have read entire pset first, and produced some items like a parsed DF up front so that it could be used by problems 3 and 5. Sorry for the redundancy here!

// pick out the flow variables
birthMigrationStat = z.pipe([
//subset df to just the relevant variables
z.filter(r => r['Variable'] == "Inflows of foreign population by nationality" ||
r['Variable'] == "Outflows of foreign population by nationality" ||
r['Variable'] == "Inflows of asylum seekers by nationality"),
// parse values into numbers
z.parseNums(['Value']),
])(migrationBirthKey)
Insert cell
// groupby and sum over country of birth and variable
birthMigrationSums = z.gbSum('Value', z.groupBy(d => d.birthKey, birthMigrationStat))
Insert cell
// fill new dataframe with the sums

birthSummed = {
// set up dataframe
let birthCountryList = z.unique(z.getCol("CO2", birthMigrationStat))
let countryNames = z.unique(z.getCol("Country of birth/nationality", birthMigrationStat))
let flowVars = z.unique(z.getCol("Variable", birthMigrationStat))
let colNames = ["birth_country_code", "birth_country", "inflow", "outflow", "asylum_inflow"]
let birthSums = []
// loop through each birth country and fill in its flow values
// by using the key values to index into the birthMigrationSums table
for (var i = 0; i < birthCountryList.length; i++)
{
let aRow = {birth_country_code: birthCountryList[i],
birth_country: countryNames[i],
inflow: 0,
outflow: 0,
asylum_inflow: 0}
let label = ""
for (var j = 0; j < flowVars.length; j++)
{
label = flowVars[j] + birthCountryList[i]
aRow[colNames[j+2]] = z.getCol("sum", (z.filter(r => r["group"] == label, birthMigrationSums)))[0]
}
birthSums.push(aRow)
}
return birthSums
}
Insert cell
Insert cell
z.print(birthSummed) // final table printed
Insert cell
Insert cell
// filter for variable = "Stock of foreign-born population by country of birth"
// filter for 2016 data only

stock2016 = z.pipe(
[
z.filter(r => r['Variable'] == "Stock of foreign-born population by country of birth" && r['Year'] == "2016"),
// convert values to numbers
z.parseNums(['Value'])
]
)(migrationBirthKey)

Insert cell
// aggregate 2016 stock data by COUNTRY OF BIRTH; otherwise there are no cases that fall into low immigration for Part C of this problem
stockForeign2016 = z.gbSum("Value", z.groupBy(d => d.CO2, stock2016))
Insert cell
Insert cell
z.print(stockForeign2016) // final table with only stock of foreign migrants by country of birth, and 2016 info, summed by country of birth/nationality
Insert cell
Insert cell
// Table 1: high immigration, > 100,000
// split original table into 3 by filtering for the appropriate conditions
high_immigration = z.filter(r => r["sum"] > 100000, stockForeign2016)
Insert cell
// Table 2: low immigration, > 50,000 & < 100,000
medium_immigration = z.filter(r => r["sum"] > 50000 && r["sum"] < 100000, stockForeign2016)
Insert cell
//Low Immigration < 50,000
low_immigration = z.filter(r => r["sum"] < 50000, stockForeign2016)
Insert cell
// print high immigration table
z.print(high_immigration)
Insert cell
// print medium immigration table
z.print(medium_immigration)
Insert cell
// print low immigration table
z.print(low_immigration)
Insert cell
Insert cell
// This can be built using the table I generated for part A of Question 5 above, which aggregated each type of flow (in, out, asylum flows) by country of birth/nationality

netFlow = {
// set up the df
let ccodeList = z.unique(z.getCol("birth_country_code", birthSummed))
let cList = z.unique(z.getCol("birth_country", birthSummed))
let df = []

// loop through each birth country, copy down its inflow and outflow,
// then calculate net inflow = inflow - outflow
for (var i = 0; i < cList.length; i++)
{
let inVal = z.getCol("inflow", (z.filter(r => r["birth_country_code"] == ccodeList[i], birthSummed)))[0]
let outVal = z.getCol("outflow", (z.filter(r => r["birth_country_code"] == ccodeList[i], birthSummed)))[0]
df.push({
birth_country_code: ccodeList[i],
birth_country: cList[i],
inflow: inVal,
outflow: outVal,
inflow_outflow: (inVal - outVal)
})

}
return df
}

Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more