PSET 4 / rlluo1 | Observable

// import vegalite

vegalite = require("@observablehq/vega-lite@0.1")

// Loading necessary library

z = require('https://bundle.run/zebras@0.0.11')

// Loading necessary library

d3 = require("d3")

// migration data URL

migrationURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/6dff0d45ed26352a7a3c7afb4ace0bad1ce8ba20/MIG_18022019163824069.csv'

countryURL = 'https://gist.githubusercontent.com/cesandoval/b834ac93c07e03ec5205843b97f68017/raw/90951b60444376eebfcbbee9beca00c083f489f6/Metadata_Country_API_SM.POP.NETM_DS2_en_csv_v2_10473747.csv'

// load migration dataset

migrationRaw = d3.csv(migrationURL)

migration = z.parseNums(['Value','Year'], migrationRaw)

// load country dataset

country = d3.csv(countryURL)

//check out migration dataset

z.head(10, migration)

// check out shape of migration dataset

shape = ({rows: z.getCol('CO2', migration).length, columns:Object.keys(migration[0]).length})

//check out country dataset

shapeCountry = ({rows: z.getCol('Country Code', country).length, columns:Object.keys(country[0]).length})

//check out country dataset

z.head(20, country)

// Join based on country code column in the country dataset, and the CO2 code in the migration dataset

migrationCountryTest = z.merge(migration, country, "CO2", "Country Code", "_m", "_c")

//migrationCountryTest = z.merge(country, migration, "Country Code", "CO2", "_m", "_c")

// Note: I think z.merge is not a true left join--it only retains the first instance of each item of the key array. therefore, the resulting join table is much smaller than the left hand side dataframe being joined upon. See source code line 87 https://zebrasjs.com/merge.js.html

// try the join again using the custom function

migrationCountry = leftjoin(migration, country, "CO2", "Country Code")

// group on a key that I generate, CO2_variable, which is unique for every income group+variable combination

varCO2Sums = z.gbSum("Value", z.groupBy(r => [r["CO2"], r["Variable"]], migrationCountry))

// generate country, variable columns that aren't concated together, so that I can graph it

// include on it the country income level, region data

// begin by creating a join key

key = z.deriveCol(r => (r.CO2 + ","+ r.Variable), migrationCountry)

// then add column key to the migration data table, dropping the unnecessary columns

migrationCountryKeyed = z.addCol("key", key, z.pickCols(["CO2","IncomeGroup","Variable"], migrationCountry))

// join the variable, income group names to the aggregate table

incomeVar = leftjoin(varCO2Sums, migrationCountryKeyed, "group" , "key")

// drop the undefined income and variable rows

incomeVarDefined = z.pipe([

z.filter( r => r['Variable'] == "Inflows of foreign population by nationality" ||

r['Variable'] == "Outflows of foreign population by nationality" ||

r['Variable'] == "Stock of foreign-born population by country of birth" ||

r['Variable'] == "Inflows of asylum seekers by nationality" ||

r['Variable'] == "Stock of foreign population by nationality"||

r['Variable'] == "Acquisition of nationality by country of former nationality" ||

r['Variable'] == "Stock of foreign-born labour by country of birth" ||

r['Variable'] == "Stock of foreign labour by nationality"),

z.filter( r => r['IncomeGroup'] == "Low income" ||

r['IncomeGroup'] == "Lower middle income" ||

r['IncomeGroup'] == "Upper middle income" ||

r['IncomeGroup'] == "High income"

)

])(incomeVar)

incomeGroups = z.unique(z.getCol('IncomeGroup', incomeVarDefined))

migVar = z.unique(z.getCol("Variable",incomeVarDefined))

viewof histogramMatrix = embed({

title: "Immigration measures by income group",

vconcat:[{

repeat: {column: z.getCol("Variable",incomeVarDefined)},

spec:{

data: {values: z.filter}, //reformat data?

mark: "bar",

encoding: {

x: {

bin:{"binned": true, "step": 700000},

field:{"repeat":"column"},

type:"quantitative"

},

y: {

aggregate: "count",

field:"CO2",

type:"quantitative",

title: "Country Count"

}

},

{

repeat: {column: migVar},

spec:{

data: {values: z.filter(r => r["IncomeGroup"] === "Lower middle income", incomeVarDefined)},

mark: "bar",

encoding: {

x: {

bin:{"binned": true, "step": 700000},

field:{"repeat":"column"},

type:"quantitative"

},

y: {

aggregate: "count",

field:"CO2",

type:"quantitative",

title: "Country Count"

}

]

})

// how to get x-scales to be different across columns? can try vconcat on grouups of columns withsimilar scales!

vegalite({

data: {values: incomeVarDefined},

mark: "bar",

encoding: {

x: {

bin: {

"binned": true

},

field: "sum",

type: "quantitative",

axis: {"title": "Total"},

//scale: {"domain": [0,20000000]}

},

y: {

aggregate: "count",

field: "CO2",

type: "nominal",

axis: {"title": "Country count"}

},

row: {

field: "IncomeGroup",

type: "nominal"

},

column: {

field: "Variable",

type: "nominal",

}

})

viewof view_stackBar= embed({

title: "Immigration by income groups and migration categories",

}})

// followed Stacked guide: https://vega.github.io/vega-lite/docs/stack.html

vegalite({

data: {values: incomeVarDefined},

mark: "bar",

encoding: {

x: {

bin: {

"binned": true,

},

field: "sum",

type: "quantitative",

axis: {"title": "Total"},

//scale: {"domain": [0,20000000]}

},

y: {

aggregate: "count",

field: "CO2",

type: "nominal",

stack: "normalize",

axis: {"title": "Country count"}

},

color: {

field: "IncomeGroup",

type: "nominal"

},

column: {

field: "Variable",

type: "nominal"

}

})

// GDP per capita per country dataset, pulled from World Bank and loaded through Github

gdpURL = 'https://gist.githubusercontent.com/rlluo1/59046b73d6e0f5c5f3fb330b241007e3/raw/474b199788921546015bf3a12d5313b1c837b699/gdp_per_capita_current'

// load in data

gdpRaw = d3.csv(gdpURL)

gdp = z.parseNums(['2017_gdppc'], gdpRaw)

//check it out

z.head(20,gdp)

//in my migration dataset, select 2017 data only,

varCO22017 = z.filter(r => r["Year"] === 2017, migrationCountry)

// note that 2017 data only has 3 variables instead of 8

// there are still 210 CO2s, so we expect on order of 3*210 = 630 rows when we aggregate by country-variable

z.unique(z.getCol("Variable", varCO22017))

//then aggregate by CO2 & variable, ie, we sum over all the Countries for each Country of Birth

//now each CO2 - variable is a row

varCO22017Sums = z.gbSum("Value", z.groupBy(r => [r["CO2"], r["Variable"]], varCO22017))

//produce join key for original 2017 data table so that I can get the CO2 and variable names I need for sums table

key2017 = z.deriveCol(r => (r.CO2 + ","+ r.Variable), varCO22017)

//add key column to 2017 table, drop unnecessary columns

varCO22017Keyed = z.addCol("key", key2017, z.pickCols(["CO2","IncomeGroup","Region","Variable"], varCO22017))

//add 2017 table country data to sums table

varCO22017SumsDeet = leftjoin(varCO22017Sums, varCO22017Keyed, "group", "key")

//merge info from keyed 2017 table onto the sums table

migrationGDP2017 = leftjoin(varCO22017SumsDeet, gdp, "CO2", "CO2")

// only graphing 1 of 3 migration variables because axis scales don't match across the three

// picked the one variable most relevant to the question

// don't include total rows, ie CO2 == TOT

vegalite({

data: {values: z.filter(r => r["Variable"] === "Stock of foreign population by nationality" && r["CO2"] != "TOT", migrationGDP2017)},

mark: "point",

encoding: {

x: {

field: "sum",

type: "quantitative",

axis: {"title": "Stock of foreign/immigrant population"}

},

y: {

field: "2017_gdppc",

type: "quantitative",

axis: {"title": "2017 per capita GDP"}

},

color: {

field: "Region",

type: "nominal"

},

}

})

// same as above but dropping the outliers

vegalite({

data: {values: z.filter(r => r["Variable"] === "Stock of foreign population by nationality" &&

r["CO2"] != "TOT" &&

r["sum"] < 4000001 &&

r["2017_gdppc"] < 150000, migrationGDP2017)},

mark: "point",

encoding: {

x: {

field: "sum",

type: "quantitative",

axis: {"title": "Stock of foreign/immigrant population"}

},

y: {

field: "2017_gdppc",

type: "quantitative",

axis: {"title": "2017 per capita GDP"}

},

color: {

field: "Region",

type: "nominal"

},

}

})

// create new table in which each row is CO2_variable_year

// this needs aggregating because each CO2 has multiple rows for recipient country

varCO2AllYearsSums = z.gbSum("Value", z.groupBy(r => [r["CO2"], r["Variable"], r["Year"]], migration))

//join key

keyAllYears = z.deriveCol(r => (r.CO2 + ","+ r.Variable + "," + r.Year), migration)

//add key to migration table, drop unneeded columns

migrationYearKeyed = z.addCol("key", keyAllYears, z.pickCols(["CO2", "Variable", "Year"], migration))

//merge keyed migration table onto the sums table

migrationByYear = leftjoin(varCO2AllYearsSums, migrationYearKeyed, "group", "key")

// Note that only the stock figures and asylum inflow variables (3 variables total) had 2017 numbers

vegalite({

data: {values: z.filter(r => r["Variable"] === "Inflows of foreign population by nationality" &&

r["CO2"] != "TOT", migrationByYear)},

layer: [

{

mark: "point",

encoding: {

y: {

field: "sum",

type: "quantitative",

axis: {"title": "In migration"}

},

x: {

field: "Year",

type: "quantitative",

//data is not in date format so we just use quantitative type

axis: {"title": "Year"},

format: "",

scale: {"domain": [2000, 2017]}

},

}

},

{

mark: "line",

encoding: {

y: {

field: "sum",

type: "quantitative",

aggregate: "mean",

axis: {"title" : "In migration"}

},

x: {

field: "Year",

type: "quantitative",

//data is not in date format so we just use quantitative type

axis: {"title": "Year"},

format: "d",

scale: {"domain": [2000, 2017]}

},

color: {"value": "firebrick"}

}

]

})

Purpose-built for displays of data