PSET 3 Wk 3 / rlluo1

rlluo1

Workspace

Published

Psets_11.454

Edited

Apr 1, 2019

// Loading necessary library

z = require('https://bundle.run/zebras@0.0.11')

// Loading necessary library

d3 = require("d3")

// Target appropriate URL for dataset

migrationURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/6dff0d45ed26352a7a3c7afb4ace0bad1ce8ba20/MIG_18022019163824069.csv'

// Load data from target URL

migration = d3.csv(migrationURL)

// check out dimensions of our dataset

shape = ({rows: z.getCol('CO2', migration).length, columns:Object.keys(migration[0]).length})

// examine the dataset

z.head(20, migration)

// Specify the target column, "Country of birth/nationality," then get the unique values within it

z.unique(z.getCol('Country of birth/nationality', migration))

//overall process:

// create groupby key by concatenating 'country' and 'variable'

// reduce df to the 3 variables we want to look at

// parse values into numbers so that we can sum over them

// group the reduced/subsetted df by country & variable, summing over the country_variable groups (this aggregates over the years)

// convert those sums from a long into a wide table

// NOTE: in this part, we sum over the "Country" variable, not the "Country of birth/nationality" variable that we aggregate over for problems 5-6. This is because the problem says "per country" in bold. sorry if that was a misinterpretation of the problem.

// create groupby key from "Country" and "Variable"

groupLabel = z.deriveCol(r => (r.Variable + r.Country), migration)

// add groupby key to the full migration table

migrationKeyed = z.addCol("key", groupLabel, migration)

// check to make sure key is formed correctly and attended to the main table properly

z.tail(10, migrationKeyed)

// subset to just the variables we want to consider, and convert "Value" to numbers so that we can add them

countryStat = z.pipe([

//subset df to just the relevant variables

z.filter(r => r['Variable'] == "Inflows of foreign population by nationality" ||

r['Variable'] == "Outflows of foreign population by nationality" ||

r['Variable'] == "Stock of foreign-born population by country of birth"),

// parse values into numbers

z.parseNums(['Value']),

])(migrationKeyed)

// groupby to produce a sum for each key (ie, each country and variable combination)

migrationSums = z.gbSum("Value", z.groupBy(d => d.key, countryStat))

// Produces a new table where each row is a country listed in the migration table's "Country" column (NOT 'Country of birth/nationality' column)

// This table also includes 3 additional data columns containing the sum of inflows, sum of outflows, or sum of stocks for that country. For each country-variable combination, we are summing over all other factors including year, gender, country of birth/nationality

df_summed = {

// create temp variables to hold the structure of the new df, ie the countries and variables to be included

let countryList = z.unique(z.getCol("Country", countryStat))

let countryCodes = z.unique(z.getCol("COU", countryStat))

let varList = z.unique(z.getCol("Variable", countryStat))

let columnNames = ["country", "inflow", "outflow", "stock"]

let sums = []

// loop over all countries

// for each country, add a row to output table with the country's total name, code, inflow, outflow, and stock

for (var i = 0; i < countryList.length; i++)

{

let currRow = {country: countryList[i],

code: countryCodes[i],

inflow: 0,

outflow: 0,

stock: 0}

// to fill out the inflow, outflow, and stock information,

// create the variable_country key unique to that value and index into the migrationSums table

// to fetch that value and add it tto our output ttable

let tag = ""

for (var j = 0; j < varList.length; j++)

{

tag = varList[j] + countryList[i]

currRow[columnNames[j+1]] = z.getCol("sum", (z.filter(r => r["group"] == tag, migrationSums)))[0]

}

sums.push(currRow) // push row to output table

}

return sums

}

z.print(df_summed) // print out results table

// Load data from target URL

codesURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/0c14fa1b1f65a610491417b2894fceecde5f4260/country-and-continent-codes-list-csv_csv.csv'

// Load data from target URL

codes = d3.csv(codesURL)

//take a peek at the df!

z.head(10, codes)

// check out odd cases where one country belongs to multiple continents; Turkey is listed twice here, once in Europe and once in Asia

z.filter(r => r["Three_Letter_Country_Code"] == 'TUR', codes)

// df with only country codes and continent

// Note: checks above reveals that Turkey falls into both Europe and Asia bins

codesSlim = z.pipe(

[

z.dropCol("Continent_Code"),

z.dropCol("Country_Name"),

z.dropCol("Two_Letter_Country_Code"),

z.dropCol("Country_Number")

]

)(codes)

// check

z.head(10, codesSlim)

// Join this continent table to the summary migration by country table I produced earlier

// Note that this put Turkey in Europe because the Europe option comes up ahead of the Asia option in the continents table

// because we wish to retain all country rows for which we have data, we use df_summed as the left part of the join

migrationContJoin = z.merge(df_summed, codesSlim, "code", "Three_Letter_Country_Code", "_summed", "_codelist")

// print out joined table

z.head(35, migrationContJoin)

// group inflow again to a more aggregated level, this time by continent

inflowCont = z.gbSum("inflow", z.groupBy(d => d.Continent_Name, migrationContJoin))

// do the same for outflows

outflowCont = z.gbSum("outflow", z.groupBy(d => d.Continent_Name, migrationContJoin))

// do the same for stock

stockCont = z.gbSum("stock", z.groupBy(d => d.Continent_Name, migrationContJoin))

// join to create new dataset

continentInOut = z.merge(inflowCont, outflowCont, "group", "group", "_in","_out")

// join remaining set to create full new dataset

continentSumDraft = z.merge(continentInOut, stockCont, "group", "group", "_inOut","_stock")

//clean up the table

continentSum = {

// form frame for the output table, including the variables and continents to be listed

let contList = z.unique(z.getCol("group", continentSumDraft))

let columnNames = ["continent", "inflow", "outflow", "stock"]

let draftNames = ["sum_in","sum_out","sum"]

let new_df = []

// loop through each continent (ie, row) and fill out its inflow, outflow, and stock

// by indexing into the appropriate continent + variable combination in the continentSumDraft table

for (var i = 0; i < contList.length; i++)

{

let thisRow = {continent: contList[i],

inflow: z.getCol("sum_in", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0],

outflow: z.getCol("sum_out", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0],

stock: z.getCol("sum", (z.filter(r => r["group"] == contList[i], continentSumDraft)))[0]}

new_df.push(thisRow)

}

return new_df // return the new df

}

//print table

// Note that Turkey is included in Europe

// This is based on the "Country" column, not "country of birth/nationality"

z.print(continentSum)

// Now aggregating by countries of BIRTH

// first create labels to group on; labels are variable + birth country code

birthFlowLabel = z.deriveCol(r => (r.Variable + r.CO2), migration)

// add that groupby key from above to the full data table

migrationBirthKey = z.addCol("birthKey", birthFlowLabel, migration)

// check that the key showed up

z.tail(20, migrationBirthKey)

// I should have read entire pset first, and produced some items like a parsed DF up front so that it could be used by problems 3 and 5. Sorry for the redundancy here!

// pick out the flow variables

birthMigrationStat = z.pipe([

//subset df to just the relevant variables

z.filter(r => r['Variable'] == "Inflows of foreign population by nationality" ||

r['Variable'] == "Outflows of foreign population by nationality" ||

r['Variable'] == "Inflows of asylum seekers by nationality"),

// parse values into numbers

z.parseNums(['Value']),

])(migrationBirthKey)

// groupby and sum over country of birth and variable

birthMigrationSums = z.gbSum('Value', z.groupBy(d => d.birthKey, birthMigrationStat))

// fill new dataframe with the sums

birthSummed = {

// set up dataframe

let birthCountryList = z.unique(z.getCol("CO2", birthMigrationStat))

let countryNames = z.unique(z.getCol("Country of birth/nationality", birthMigrationStat))

let flowVars = z.unique(z.getCol("Variable", birthMigrationStat))

let colNames = ["birth_country_code", "birth_country", "inflow", "outflow", "asylum_inflow"]

let birthSums = []

// loop through each birth country and fill in its flow values

// by using the key values to index into the birthMigrationSums table

for (var i = 0; i < birthCountryList.length; i++)

{

let aRow = {birth_country_code: birthCountryList[i],

birth_country: countryNames[i],

inflow: 0,

outflow: 0,

asylum_inflow: 0}

let label = ""

for (var j = 0; j < flowVars.length; j++)

{

label = flowVars[j] + birthCountryList[i]

aRow[colNames[j+2]] = z.getCol("sum", (z.filter(r => r["group"] == label, birthMigrationSums)))[0]

}

birthSums.push(aRow)

}

return birthSums

}

z.print(birthSummed) // final table printed

// filter for variable = "Stock of foreign-born population by country of birth"

// filter for 2016 data only

stock2016 = z.pipe(

[

z.filter(r => r['Variable'] == "Stock of foreign-born population by country of birth" && r['Year'] == "2016"),

// convert values to numbers

z.parseNums(['Value'])

]

)(migrationBirthKey)

// aggregate 2016 stock data by COUNTRY OF BIRTH; otherwise there are no cases that fall into low immigration for Part C of this problem

stockForeign2016 = z.gbSum("Value", z.groupBy(d => d.CO2, stock2016))

z.print(stockForeign2016) // final table with only stock of foreign migrants by country of birth, and 2016 info, summed by country of birth/nationality

// Table 1: high immigration, > 100,000

// split original table into 3 by filtering for the appropriate conditions

high_immigration = z.filter(r => r["sum"] > 100000, stockForeign2016)

// Table 2: low immigration, > 50,000 & < 100,000

medium_immigration = z.filter(r => r["sum"] > 50000 && r["sum"] < 100000, stockForeign2016)

//Low Immigration < 50,000

low_immigration = z.filter(r => r["sum"] < 50000, stockForeign2016)

// print high immigration table

z.print(high_immigration)

// print medium immigration table

z.print(medium_immigration)

// print low immigration table

z.print(low_immigration)

// This can be built using the table I generated for part A of Question 5 above, which aggregated each type of flow (in, out, asylum flows) by country of birth/nationality

netFlow = {

// set up the df

let ccodeList = z.unique(z.getCol("birth_country_code", birthSummed))

let cList = z.unique(z.getCol("birth_country", birthSummed))

let df = []

// loop through each birth country, copy down its inflow and outflow,

// then calculate net inflow = inflow - outflow

for (var i = 0; i < cList.length; i++)

{

let inVal = z.getCol("inflow", (z.filter(r => r["birth_country_code"] == ccodeList[i], birthSummed)))[0]

let outVal = z.getCol("outflow", (z.filter(r => r["birth_country_code"] == ccodeList[i], birthSummed)))[0]

df.push({

birth_country_code: ccodeList[i],

birth_country: cList[i],

inflow: inVal,

outflow: outVal,

inflow_outflow: (inVal - outVal)

})

}

return df

}

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.

Learn more