Published
Edited
Mar 3, 2020
Insert cell
md`# IAT355 Assignment 3`
Insert cell
md`## Libraries`
Insert cell
d3 = require("d3@5")
Insert cell
VegaLite = require('@observablehq/vega-lite')
Insert cell
dfjs = require('https://bundle.run/dataframe-js@1.3.2')
Insert cell
md`## Data Inspection`
Insert cell
md`Import the data using dataframe-js.`
Insert cell
// Load the cameras CSV from the gist URL into a dataframe-js DataFrame.
dataframe = dfjs.DataFrame.fromCSV('https://gist.githubusercontent.com/jennyyyzou/958e8a439e1845dcc4948dbd352f2cc4/raw/65653eedd965cb7cbc04f486e61f3821c2c96005/Cameras.csv')
Insert cell
Insert cell
// Treat the string '0' as a missing-value marker: replace it with '' in each
// measurement column, one column at a time, starting from `dataframe`.
dataframe1 = [
  "max_resolution",
  "low_resolution",
  "effective_pixels",
  "zoom_wide",
  "zoom_tele",
  "normal_focus_range",
  "macro_focus_range",
  "storage",
  "weight",
  "dimensions",
  "price"
].reduce((frame, column) => frame.replace('0', '', column), dataframe)
Insert cell
Insert cell
{
let maxR = dataframe1.countValue('', "max_resolution")
return md`* Found **${maxR}** missing values in **Max resolution**`
}
Insert cell
{
let lowR = dataframe1.countValue('', "low_resolution")
return md`* Found **${lowR}** missing values in **Low resolution**`
}
Insert cell
{
let pixel = dataframe1.countValue('', "effective_pixels")
return md`* Found **${pixel}** missing values in **Effective pixels**`
}
Insert cell
{
let zoomW = dataframe1.countValue('', "zoom_wide")
return md`* Found **${zoomW}** missing values in **Zoom wide (W)**`
}
Insert cell
{
let zoomT = dataframe1.countValue('', "zoom_tele")
return md`* Found **${zoomT}** missing values in **Zoom tele (T)**`
}
Insert cell
{
let focusNormal = dataframe1.countValue('', "normal_focus_range")
return md`* Found **${focusNormal}** missing values in **Normal focus range**`
}
Insert cell
{
let focusMacro = dataframe1.countValue('', "macro_focus_range")
return md`* Found **${focusMacro}** missing values in **Macro focus range**`
}
Insert cell
{
let storage = dataframe1.countValue('', "storage")
return md`* Found **${storage}** missing values in **Storage**`
}
Insert cell
{
let weight = dataframe1.countValue('', "weight")
return md`* Found **${weight}** missing values in **Weight (inc. batteries)**`
}
Insert cell
{
let dim = dataframe1.countValue('', "dimensions")
return md`* Found **${dim}** missing values in **Dimensions**`
}
Insert cell
{
let price = dataframe1.countValue('', "price")
return md`* Found **${price}** missing values in **Price**`
}
Insert cell
md`Convert the dataset to a new form with corrected data.`
Insert cell
// Export the cleaned DataFrame (dataframe1) through dataframe-js's toCollection().
cameras = dataframe1.toCollection()
Insert cell
md`Import dataset with d3.`
Insert cell
// Re-load the cameras CSV with d3, converting each parsed row into a typed record:
// model and company are passed through unchanged, every other column is coerced
// to a number.
// NOTE(review): numeric coercion turns a blank cell into 0 and keeps '0' as 0, so
// the "missing value" cleaning applied to dataframe1 is not reflected here — confirm
// this is intended for the statistics computed from newcameras.
newcameras = d3.csv(
  "https://gist.githubusercontent.com/jennyyyzou/958e8a439e1845dcc4948dbd352f2cc4/raw/65653eedd965cb7cbc04f486e61f3821c2c96005/Cameras.csv",
  (row) => ({
    model: row.model,
    company: row.company,
    release_date: Number(row.release_date),
    max_resolution: Number(row.max_resolution),
    low_resolution: Number(row.low_resolution),
    effective_pixels: Number(row.effective_pixels),
    zoom_wide: Number(row.zoom_wide),
    zoom_tele: Number(row.zoom_tele),
    normal_focus_range: Number(row.normal_focus_range),
    macro_focus_range: Number(row.macro_focus_range),
    storage: Number(row.storage),
    weight: Number(row.weight),
    dimensions: Number(row.dimensions),
    price: Number(row.price)
  })
)
Insert cell
md` ## 1. Find the maximum and minimum`
Insert cell
md` ### 1a). Low resolution`
Insert cell
// Largest low_resolution value among the newcameras rows.
maxLowResolution = d3.max(newcameras, ({ low_resolution }) => low_resolution)
Insert cell
// Smallest low_resolution value among the newcameras rows.
minLowResolution = d3.min(newcameras, ({ low_resolution }) => low_resolution)
Insert cell
md` ### 1b). Storage`
Insert cell
// Largest storage value among the newcameras rows.
maxStorage = d3.max(newcameras, ({ storage }) => storage)
Insert cell
// Smallest storage value among the newcameras rows.
minStorage = d3.min(newcameras, ({ storage }) => storage)
Insert cell
md` ### 1c). Price`
Insert cell
// Largest price value among the newcameras rows.
maxPrice = d3.max(newcameras, ({ price }) => price)
Insert cell
// Smallest price value among the newcameras rows.
minPrice = d3.min(newcameras, ({ price }) => price)
Insert cell
md` ## 2. Find the sum`
Insert cell
md` ### 2a). Max resolution`
Insert cell
// Total of max_resolution over all newcameras rows.
sumMaxResolution = d3.sum(newcameras, ({ max_resolution }) => max_resolution)
Insert cell
md` ### 2b). Weight`
Insert cell
// Total of weight over all newcameras rows.
sumWeight = d3.sum(newcameras, ({ weight }) => weight)
Insert cell
md` ### 2c). Price`
Insert cell
// Total of price over all newcameras rows.
sumPrice = d3.sum(newcameras, ({ price }) => price)
Insert cell
md` ## 3. Find the average`
Insert cell
md` ### 3a). Max resolution`
Insert cell
// Histogram of max_resolution: bar chart of record counts per bin (maxbins: 10).
VegaLite({
  mark: "bar",
  data: { values: newcameras },
  encoding: {
    y: { type: "quantitative", aggregate: "count" },
    x: {
      type: "quantitative",
      field: "max_resolution",
      bin: { maxbins: 10 }
    }
  }
})
Insert cell
md` Because there are outliers near 0 and 5,600 that would make the histogram look rough with a larger number of bins, I chose 10 bins to show the histogram shape clearly. The histogram for Max resolution is **symmetric**, so the mean is used as the average.`
Insert cell
// Mean of max_resolution over the newcameras rows.
averageMaxResolution = d3.mean(newcameras, ({ max_resolution }) => max_resolution)
Insert cell
md` ### 3b). Low resolution`
Insert cell
// Histogram of low_resolution: bar chart of record counts per bin (maxbins: 10).
VegaLite({
  mark: "bar",
  data: { values: newcameras },
  encoding: {
    y: { type: "quantitative", aggregate: "count" },
    x: {
      type: "quantitative",
      field: "low_resolution",
      bin: { maxbins: 10 }
    }
  }
})
Insert cell
md` I chose 10 bins instead of a smaller number to show the skewed shape clearly. The histogram for Low resolution is **left-skewed**, so the median is used as the average. The median is a useful measure in this case because it ignores outliers.`
Insert cell
// Median of low_resolution over the newcameras rows.
averageLowResolution = d3.median(newcameras, ({ low_resolution }) => low_resolution)
Insert cell
md` ### 3c). Zoom tele(T)`
Insert cell
// Histogram of zoom_tele: bar chart of record counts per bin (maxbins: 8).
VegaLite({
  mark: "bar",
  data: { values: newcameras },
  encoding: {
    y: { type: "quantitative", aggregate: "count" },
    x: {
      type: "quantitative",
      field: "zoom_tele",
      bin: { maxbins: 8 }
    }
  }
})
Insert cell
md` I chose 8 bins instead of a smaller number to show the skewed shape clearly. The histogram for Zoom tele is **right-skewed**, so the median is used as the average. The median is a useful measure because it simply counts the values, sorts them, and takes the one in the middle.`
Insert cell
// Median of zoom_tele over the newcameras rows.
averageZoomTele = d3.median(newcameras, ({ zoom_tele }) => zoom_tele)
Insert cell
md` ### 3d). Normal focus range`
Insert cell
// Histogram of normal_focus_range: bar chart of record counts per bin (maxbins: 10).
VegaLite({
  mark: "bar",
  data: { values: newcameras },
  encoding: {
    y: { type: "quantitative", aggregate: "count" },
    x: {
      type: "quantitative",
      field: "normal_focus_range",
      bin: { maxbins: 10 }
    }
  }
})
Insert cell
md` I chose 10 bins over 20 bins because 10 bins show the histogram shape clearly, whereas 20 bins display an ambiguous shape that would be hard to analyze. The histogram for Normal focus range appears **symmetric**, with the same shape on each side, so the mean is used as the average.`
Insert cell
// Mean of normal_focus_range over the newcameras rows.
averageNormalFocusRange = d3.mean(newcameras, ({ normal_focus_range }) => normal_focus_range)
Insert cell
md` ### 3e). Weight`
Insert cell
// Histogram of weight: bar chart of record counts per bin (maxbins: 10).
VegaLite({
  mark: "bar",
  data: { values: newcameras },
  encoding: {
    y: { type: "quantitative", aggregate: "count" },
    x: {
      type: "quantitative",
      field: "weight",
      bin: { maxbins: 10 }
    }
  }
})
Insert cell
md` I chose 10 bins instead of 8 bins to show the skewed shape in detail. The histogram for Weight is **right-skewed**, so the median is used as the average. The median is a useful measure because it simply counts the values, sorts them, and takes the one in the middle.`
Insert cell
// Median of weight over the newcameras rows.
averageWeight = d3.median(newcameras, ({ weight }) => weight)
Insert cell
md` ### 3f). Dimensions`
Insert cell
// Histogram of dimensions: bar chart of record counts per bin (maxbins: 12).
VegaLite({
  mark: "bar",
  data: { values: newcameras },
  encoding: {
    y: { type: "quantitative", aggregate: "count" },
    x: {
      type: "quantitative",
      field: "dimensions",
      bin: { maxbins: 12 }
    }
  }
})
Insert cell
md` I chose 12 bins instead of 10 bins to show in detail how the skewed shape spreads over the range. The histogram for Dimensions is **right-skewed**, so the median is used as the average. The median is a useful measure because it simply counts the values, sorts them, and takes the one in the middle.`
Insert cell
// Median of dimensions over the newcameras rows.
averageDimensions = d3.median(newcameras, ({ dimensions }) => dimensions)
Insert cell
md` ### 3g). Price`
Insert cell
// Histogram of price: bar chart of record counts per bin (maxbins: 10).
VegaLite({
  mark: "bar",
  data: { values: newcameras },
  encoding: {
    y: { type: "quantitative", aggregate: "count" },
    x: {
      type: "quantitative",
      field: "price",
      bin: { maxbins: 10 }
    }
  }
})
Insert cell
md` I chose 10 bins instead of 6 bins to show the slope clearly from left to right. The histogram for Price is **right-skewed**, so the median, which falls in the first bin (0-1000), is used as the average. The median is a useful measure here because it is smaller than the mean and it ignores the outliers.`
Insert cell
// Median of price over the newcameras rows.
averagePrice = d3.median(newcameras, ({ price }) => price)
Insert cell
md` ## 4. Find the count`
Insert cell
md` ## Challenge and issues`
Insert cell
md`I began the project by cleaning the data with DataFrame; however, I had to re-import the data using d3 for the calculations. When I tried to change the type of each dimension with JavaScript's built-in operators, the naming convention caused some frustration, for example **release_date : +d["release_date"]**. I also realized that it is hard to determine the shape of a histogram because some of the shapes can be very arbitrary. Thus it is important to play around with different numbers of bins to find the most obvious shape.`
Insert cell
md`## Reference`
Insert cell
md`https://observablehq.com/@sfu-iat355/data-inspection-and-cleaning`
Insert cell
md`https://observablehq.com/@sfu-iat355/week-3-getting-started-with-data`
Insert cell
md`https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter`
Insert cell
md`https://vega.github.io/vega-lite/docs/bin.html`
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more