IAT355 Assignment 3 / Jenny Zou

Jenny Zou

Workspace

Published

Edited

Mar 3, 2020

md`# IAT355 Assignment 3`

md`## Libraries`

d3 = require("d3@5")

VegaLite = require('@observablehq/vega-lite')

dfjs = require('https://bundle.run/dataframe-js@1.3.2')

md`## Data Inspection`

md`Import the data using dataframe-js.`

// Load the raw Cameras CSV from the gist through dataframe-js (DataFrame.fromCSV).
// This is the uncleaned source that the dataframe1 cell derives from.
dataframe = dfjs.DataFrame.fromCSV('https://gist.githubusercontent.com/jennyyyzou/958e8a439e1845dcc4948dbd352f2cc4/raw/65653eedd965cb7cbc04f486e61f3821c2c96005/Cameras.csv')

// Treat the placeholder '0' as a missing value: blank it out, one column at a
// time, in every measurement column so later cells can count the empty strings.
dataframe1 = [
  "max_resolution",
  "low_resolution",
  "effective_pixels",
  "zoom_wide",
  "zoom_tele",
  "normal_focus_range",
  "macro_focus_range",
  "storage",
  "weight",
  "dimensions",
  "price"
].reduce((frame, column) => frame.replace('0', '', column), dataframe)

{

let maxR = dataframe1.countValue('', "max_resolution")

return md`* Found **${maxR}** missing values in **Max resolution**`

}

{

let lowR = dataframe1.countValue('', "low_resolution")

return md`* Found **${lowR}** missing values in **Low resolution**`

}

{

let pixel = dataframe1.countValue('', "effective_pixels")

return md`* Found **${pixel}** missing values in **Effective pixels**`

}

{

let zoomW = dataframe1.countValue('', "zoom_wide")

return md`* Found **${zoomW}** missing values in **Zoom wide (W)**`

}

{

let zoomT = dataframe1.countValue('', "zoom_tele")

return md`* Found **${zoomT}** missing values in **Zoom tele (T)**`

}

{

let focusNormal = dataframe1.countValue('', "normal_focus_range")

return md`* Found **${focusNormal}** missing values in **Normal focus range**`

}

{

let focusMacro = dataframe1.countValue('', "macro_focus_range")

return md`* Found **${focusMacro}** missing values in **Macro focus range**`

}

{

let storage = dataframe1.countValue('', "storage")

return md`* Found **${storage}** missing values in **Storage**`

}

{

let weight = dataframe1.countValue('', "weight")

return md`* Found **${weight}** missing values in **Weight (inc. batteries)**`

}

{

let dim = dataframe1.countValue('', "dimensions")

return md`* Found **${dim}** missing values in **Dimensions**`

}

{

let price = dataframe1.countValue('', "price")

return md`* Found **${price}** missing values in **Price**`

}

md`Convert the dataset to a new form with corrected data.`

// Export the cleaned DataFrame (dataframe1) via toCollection() for use outside dataframe-js.
cameras = dataframe1.toCollection()

md`Import dataset with d3.`

// Re-import the raw CSV with d3, converting every measurement column to a number.
// model and company stay as strings; all other listed columns are coerced with unary +.
// NOTE(review): this reads the original CSV, not dataframe1, so the '0'
// placeholders blanked out in dataframe1 are parsed here as numeric 0 — confirm
// that is intended for the statistics below.
newcameras = d3.csv("https://gist.githubusercontent.com/jennyyyzou/958e8a439e1845dcc4948dbd352f2cc4/raw/65653eedd965cb7cbc04f486e61f3821c2c96005/Cameras.csv", row => {
  const numericColumns = [
    "release_date",
    "max_resolution",
    "low_resolution",
    "effective_pixels",
    "zoom_wide",
    "zoom_tele",
    "normal_focus_range",
    "macro_focus_range",
    "storage",
    "weight",
    "dimensions",
    "price"
  ];
  const camera = {model: row.model, company: row.company};
  for (const column of numericColumns) {
    camera[column] = +row[column];
  }
  return camera;
})

md` ## 1. Find the maximum and minimum`

md` ### 1a). Low resolution`

// Largest low_resolution value across the parsed camera rows.
maxLowResolution = d3.max(newcameras, ({low_resolution}) => low_resolution)

// Smallest low_resolution value across the parsed camera rows.
minLowResolution = d3.min(newcameras, ({low_resolution}) => low_resolution)

md` ### 1b). Storage`

// Largest storage value across the parsed camera rows.
maxStorage = d3.max(newcameras, ({storage}) => storage)

// Smallest storage value across the parsed camera rows.
minStorage = d3.min(newcameras, ({storage}) => storage)

md` ### 1c). Price`

// Largest price across the parsed camera rows.
maxPrice = d3.max(newcameras, ({price}) => price)

// Smallest price across the parsed camera rows.
minPrice = d3.min(newcameras, ({price}) => price)

md` ## 2. Find the sum`

md` ### 2a). Max resolution`

// Total of max_resolution over all parsed camera rows.
sumMaxResolution = d3.sum(newcameras, ({max_resolution}) => max_resolution)

md` ### 2b). Weight`

// Total of weight over all parsed camera rows.
sumWeight = d3.sum(newcameras, ({weight}) => weight)

md` ### 2c). Price`

// Total of price over all parsed camera rows.
sumPrice = d3.sum(newcameras, ({price}) => price)

md` ## 3. Find the average`

md` ### 3a). Max resolution`

// Histogram of max_resolution: x bins the field (at most 10 bins), y counts rows per bin.
// The x and encoding objects are now closed explicitly; without those braces
// the cell does not parse.
VegaLite({
  data: {values: newcameras},
  mark: "bar",
  encoding: {
    x: {
      bin: {maxbins: 10},
      field: "max_resolution",
      type: "quantitative"
    },
    y: {
      aggregate: "count",
      type: "quantitative"
    }
  }
})

md` Because outliers near 0 and 5,600 make the histogram look rough when a larger number of bins is used, I chose 10 bins to show the histogram shape clearly. The histogram for Max resolution is **symmetric**, so the mean is used as the average.`

// Mean of max_resolution over all parsed camera rows.
averageMaxResolution = d3.mean(newcameras, ({max_resolution}) => max_resolution)

md` ### 3b). Low resolution`

// Histogram of low_resolution: x bins the field (at most 10 bins), y counts rows per bin.
// The x and encoding objects are now closed explicitly; without those braces
// the cell does not parse.
VegaLite({
  data: {values: newcameras},
  mark: "bar",
  encoding: {
    x: {
      bin: {maxbins: 10},
      field: "low_resolution",
      type: "quantitative"
    },
    y: {
      aggregate: "count",
      type: "quantitative"
    }
  }
})

md` I chose 10 bins instead of a smaller number to show the skewed shape clearly. The histogram for Low resolution is **left-skewed**, so the median is used as the average. The median is a useful measure in this case because it ignores outliers.`

// Median of low_resolution over all parsed camera rows.
averageLowResolution = d3.median(newcameras, ({low_resolution}) => low_resolution)

md` ### 3c). Zoom tele(T)`

// Histogram of zoom_tele: x bins the field (at most 8 bins), y counts rows per bin.
// The x and encoding objects are now closed explicitly; without those braces
// the cell does not parse.
VegaLite({
  data: {values: newcameras},
  mark: "bar",
  encoding: {
    x: {
      bin: {maxbins: 8},
      field: "zoom_tele",
      type: "quantitative"
    },
    y: {
      aggregate: "count",
      type: "quantitative"
    }
  }
})

md` I chose 8 bins instead of a smaller number to show the skewed shape clearly. The histogram for Zoom tele is **right-skewed**, so the median is used as the average. The median is a useful measure because it simply counts how many values there are, sorts them, and then takes the value in the middle.`

// Median of zoom_tele over all parsed camera rows.
averageZoomTele = d3.median(newcameras, ({zoom_tele}) => zoom_tele)

md` ### 3d). Normal focus range`

// Histogram of normal_focus_range: x bins the field (at most 10 bins), y counts rows per bin.
// The x and encoding objects are now closed explicitly; without those braces
// the cell does not parse.
VegaLite({
  data: {values: newcameras},
  mark: "bar",
  encoding: {
    x: {
      bin: {maxbins: 10},
      field: "normal_focus_range",
      type: "quantitative"
    },
    y: {
      aggregate: "count",
      type: "quantitative"
    }
  }
})

md` I chose 10 bins over 20 bins because 10 bins show the histogram shape clearly, whereas 20 bins display an ambiguous shape that would be hard to analyze. The histogram for Normal focus range appears **symmetric**, with the same shape on each side, so the mean is used as the average.`

// Mean of normal_focus_range over all parsed camera rows.
averageNormalFocusRange = d3.mean(newcameras, ({normal_focus_range}) => normal_focus_range)

md` ### 3e). Weight`

// Histogram of weight: x bins the field (at most 10 bins), y counts rows per bin.
// The x and encoding objects are now closed explicitly; without those braces
// the cell does not parse.
VegaLite({
  data: {values: newcameras},
  mark: "bar",
  encoding: {
    x: {
      bin: {maxbins: 10},
      field: "weight",
      type: "quantitative"
    },
    y: {
      aggregate: "count",
      type: "quantitative"
    }
  }
})

md` I chose 10 bins instead of 8 bins to show the skewed shape in detail. The histogram for Weight is **right-skewed**, so the median is used as the average. The median is a useful measure because it simply counts how many values there are, sorts them, and then takes the value in the middle.`

// Median of weight over all parsed camera rows.
averageWeight = d3.median(newcameras, ({weight}) => weight)

md` ### 3f). Dimensions`

// Histogram of dimensions: x bins the field (at most 12 bins), y counts rows per bin.
// The x and encoding objects are now closed explicitly; without those braces
// the cell does not parse.
VegaLite({
  data: {values: newcameras},
  mark: "bar",
  encoding: {
    x: {
      bin: {maxbins: 12},
      field: "dimensions",
      type: "quantitative"
    },
    y: {
      aggregate: "count",
      type: "quantitative"
    }
  }
})

md` I chose 12 bins instead of 10 bins to show in detail how the skewed shape spreads over the range. The histogram for Dimensions is **right-skewed**, so the median is used as the average. The median is a useful measure because it simply counts how many values there are, sorts them, and then takes the value in the middle.`

// Median of dimensions over all parsed camera rows.
averageDimensions = d3.median(newcameras, ({dimensions}) => dimensions)

md` ### 3g). Price`

// Histogram of price: x bins the field (at most 10 bins), y counts rows per bin.
// The x and encoding objects are now closed explicitly; without those braces
// the cell does not parse.
VegaLite({
  data: {values: newcameras},
  mark: "bar",
  encoding: {
    x: {
      bin: {maxbins: 10},
      field: "price",
      type: "quantitative"
    },
    y: {
      aggregate: "count",
      type: "quantitative"
    }
  }
})

md` I chose 10 bins instead of 6 bins to show the slope clearly from left to right. The histogram for Price is **right-skewed**, so the median, which falls in the first bin (0-1000), is used as the average. The median is a useful measure here because it is smaller than the mean and it ignores the outliers.`

// Median of price over all parsed camera rows.
averagePrice = d3.median(newcameras, ({price}) => price)

md` ## 4. Find the count`

md` ## Challenge and issues`

md`I began the project by cleaning the data with DataFrame; however, I had to re-import the data using d3 for the calculations. When I tried to change the type of each dimension with JavaScript's built-in operators, the naming convention, such as **release_date : +d["release_date"]**, caused some frustration when naming the dimensions. I also realized that it is hard to determine the shape of a histogram because some of the shapes can be very arbitrary. Thus it is important to play around with different bin sizes to look for the most obvious shape.`

md`## Reference`

md`https://observablehq.com/@sfu-iat355/data-inspection-and-cleaning`

md`https://observablehq.com/@sfu-iat355/week-3-getting-started-with-data`

md`https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter`

md`https://vega.github.io/vega-lite/docs/bin.html`

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.

Learn more