Published
Edited
Jul 12, 2018
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Load the 'SPAM_ASSASSIN' dataset through the notebook's `datasets` helper.
// Later cells read the `group` and `text` properties of its elements.
spam = datasets( 'SPAM_ASSASSIN' )
Insert cell
Insert cell
training = {
var arr = []
for ( var i = 0; i < spam.length; i++ ) {
if ( spam[i].group === 'easy-ham-1' || spam[i].group === 'spam-2' ) {
arr.push( spam[i] );
}
}
return arr;
}
Insert cell
Insert cell
test = {
var arr = []
for ( var i = 0; i < spam.length; i++ ) {
if ( spam[i].group === 'easy-ham-2' || spam[i].group === 'hard-ham-1' || spam[i].group === 'spam-1' ) {
arr.push( spam[i] );
}
}
return arr;
}
Insert cell
Insert cell
/**
 * Extracts and normalizes the body of a raw email.
 *
 * The body is everything after the first blank line (two consecutive line
 * breaks). Digits, `-` and `+` are stripped, the text is lowercased,
 * contractions are expanded, punctuation is removed and the words listed in
 * `STOPWORDS` are dropped.
 *
 * @param {string} email - raw email text (headers followed by the body)
 * @returns {string} normalized body text, or an empty string when the email
 *   contains no blank line separating headers from a body
 */
function extractBody( email ) {
  // A line break is CRLF, a lone CR, or a lone LF. Treating CRLF as ONE break
  // keeps a single Windows-style line ending from being mistaken for the
  // blank line that separates the headers from the body.
  const BODY_REGEXP = /(?:\r\n|\r(?!\n)|\n){2}([\s\S]*)$/;
  const match = email.match( BODY_REGEXP );
  if ( match === null ) {
    // No header/body separator: there is no body to extract.
    return '';
  }
  let text = match[ 1 ];
  // Remove numbers and other special characters:
  text = text.replace( /[0-9\-\+]/g, '' );
  text = stdlib.lowercase( text );
  text = stdlib.expandContractions( text );
  text = stdlib.removePunctuation( text );
  // Remove common words such as "the" or "and":
  text = stdlib.removeWords( text, STOPWORDS );
  return text;
}
Insert cell
Insert cell
// Load the 'STOPWORDS_EN' dataset; `extractBody` passes it to
// `stdlib.removeWords` to strip common words from email bodies.
STOPWORDS = datasets( 'STOPWORDS_EN' )
Insert cell
Insert cell
Insert cell
// Preview: normalized body of the training email at index 9.
extractBody( training[9].text )
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// WARNING: Operation will take some time to complete...

// Annotates every training email in place: `body` holds the normalized body
// text and `tokens` the tokens of that body.
stdlib.inmap( training, ( email ) => {
  const body = extractBody( email.text );
  email.body = body;
  email.tokens = stdlib.tokenize( body );
  return email;
})
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Split the training emails into two classes: group 'spam-2' is labelled
// 'spam', every other group is labelled 'nospam'.
grouped = stdlib.groupBy( training, ( email ) => (
  email.group === 'spam-2' ? 'spam' : 'nospam'
) )
Insert cell
Insert cell
tokens = {
return {
'spam': stdlib.flattenArray( stdlib.pluck( grouped.spam, 'tokens' ) ),
'nospam': stdlib.flattenArray( stdlib.pluck( grouped.nospam, 'tokens' ) )
};
}
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
freqs = {
return {
'spam': stdlib.countBy( tokens.spam, stdlib.identity ),
'nospam': stdlib.countBy( tokens.nospam, stdlib.identity )
};
}
Insert cell
Insert cell
// Report the counts recorded in `freqs` for the token "money" in each class.
`"Money" occurs ${freqs.spam.money} in spam and ${freqs.nospam.money} in non-spam emails`
Insert cell
// Report the counts recorded in `freqs` for the token "work" in each class.
`"Work" occurs ${freqs.spam.work} in spam and ${freqs.nospam.work} in non-spam emails`
Insert cell
Insert cell
groupProbs = {
return {
'spam': 1396 / ( 1396 + 2500 ),
'nospam': 2500 / ( 1396 + 2500 )
};
}
Insert cell
Insert cell
/**
 * Estimates the probability of a word given the spam class, with add-one
 * smoothing: (count + 1) / (number of spam tokens + vocabSize).
 *
 * @param {string} word - token to look up
 * @returns {number} smoothed relative frequency of `word` among spam tokens
 */
function wordSpamProb( word ) {
  const counts = freqs.spam;
  // Only own properties are real counts. A plain lookup would also find
  // inherited members such as `constructor`, turning the result into NaN.
  const nk = Object.prototype.hasOwnProperty.call( counts, word ) ? counts[ word ] : 0;
  return ( nk + 1 ) / ( tokens.spam.length + vocabSize );
}
Insert cell
Insert cell
/**
 * Estimates the probability of a word given the non-spam class, with add-one
 * smoothing: (count + 1) / (number of non-spam tokens + vocabSize).
 *
 * @param {string} word - token to look up
 * @returns {number} smoothed relative frequency of `word` among non-spam tokens
 */
function wordNospamProb( word ) {
  const counts = freqs.nospam;
  // Only own properties are real counts. A plain lookup would also find
  // inherited members such as `constructor`, turning the result into NaN.
  const nk = Object.prototype.hasOwnProperty.call( counts, word ) ? counts[ word ] : 0;
  return ( nk + 1 ) / ( tokens.nospam.length + vocabSize );
}
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Classifies a raw email as 'spam' or 'nospam'.
 *
 * For each class the score is the log of the class prior plus the sum of the
 * logs of the per-word probabilities of every token in the email body. The
 * class with the strictly larger score wins; ties go to 'nospam'.
 *
 * @param {string} text - raw email text
 * @returns {string} 'spam' or 'nospam'
 */
function classifyEmail( text ) {
  const words = stdlib.tokenize( extractBody( text ) );

  // Log prior of `label` plus the log-probability of every word under it.
  const logScore = ( label, wordProb ) => {
    let score = stdlib.base.ln( groupProbs[ label ] );
    for ( const word of words ) {
      score += stdlib.base.ln( wordProb( word ) );
    }
    return score;
  };

  const spamScore = logScore( 'spam', wordSpamProb );
  const nospamScore = logScore( 'nospam', wordNospamProb );
  if ( spamScore > nospamScore ) {
    return 'spam';
  }
  return 'nospam';
}
Insert cell
Insert cell
// Spot check: predicted class of the test email at index 1900.
classifyEmail( test[ 1900 ].text )
Insert cell
// Recorded group of the same test email, for comparison with the prediction.
test[1900].group
Insert cell
Insert cell
testAccuracy = {
var correct = 0;
for ( var i = 0; i < test.length; i++ ) {
var pred = classifyEmail( test[ i ].text );
if (
pred === 'spam' && test[ i ].group === 'spam-1' ||
pred === 'nospam' && ( test[ i ].group === 'easy-ham-2' || test[ i ].group === 'hard-ham-1' )
) {
correct += 1;
}
}
return correct / test.length;
}
Insert cell
trainingAccuracy = {
var correct = 0;
for ( var i = 0; i < training.length; i++ ) {
var pred = classifyEmail( training[ i ].text );
if (
pred === 'spam' && training[ i ].group === 'spam-2' ||
pred === 'nospam' && training[ i ].group === 'easy-ham-1'
) {
correct += 1;
}
}
return correct / training.length;
}
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
pltTruePDF = {
var x = stdlib.linspace( 0, 30, 200 );
var y = x.map( function( x ) { return truePDF( x ) } );
var plt = stdlib.plot([ x ], [ y ], {
'xLabel': 'Value',
'yLabel': 'Density',
'width': 600,
'xMax': 15,
'xMin': 0
})
return plt;
}
Insert cell
Insert cell
/**
 * Density of a two-component normal mixture at `x`: a normal PDF with
 * parameters (2, 1) weighted by `weight`, plus a normal PDF with parameters
 * (9, 2) weighted by `1 - weight`.
 *
 * @param {number} x - point at which to evaluate the density
 * @returns {number} mixture density at `x`
 */
function truePDF( x ) {
  const normalPdf = stdlib.base.dists.normal.pdf;
  const first = normalPdf( x, 2, 1 );
  const second = normalPdf( x, 9, 2 );
  return weight * first + (1-weight) * second;
}
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Kernel density estimate at `x`: the sum of `K( ( x - xi ) / h )` over all
 * observations `xi`, divided by `n * h` where `n` is the number of
 * observations.
 *
 * @param {number} x - point at which to estimate the density
 * @param {Array<number>} xs - observations
 * @param {Function} K - kernel function taking a single number
 * @param {number} h - bandwidth
 * @returns {number} estimated density at `x`
 */
function fhat( x, xs, K, h ) {
  const count = xs.length;
  let total = 0;
  for ( const xi of xs ) {
    total += K( ( x - xi ) / h );
  }
  return total / ( count * h );
}
Insert cell
Insert cell
// Five example observations.
// NOTE(review): presumably the sample used to illustrate the kernel density
// estimate; the cells that consume `xobs` are not visible here.
xobs = [ 1, 3, 4, 9, 13 ]
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Kernel function passed to `fhat` and `meanShift`: a normal PDF created by
// the factory with parameters (0.0, 1.0).
// Set K to `stdlib.base.dists.normal.pdf.factory( 0.0, 1.0 )` to restore default
K = stdlib.base.dists.normal.pdf.factory( 0.0, 1.0 )
Insert cell
Insert cell
Insert cell
/**
 * Rule-of-thumb bandwidth for a sample:
 * 0.9 * min( stdev( x ), iqr( x ) ) * n^(-1/5), where n is the sample size.
 *
 * NOTE(review): this resembles Silverman's rule of thumb, which uses
 * IQR / 1.34; `iqr` is defined outside this view - confirm whether it already
 * applies that scaling.
 *
 * @param {Array<number>} x - sample values
 * @returns {number} suggested bandwidth
 */
function guessBandwidth( x ) {
  const spread = stdlib.base.min( stdev( x ), iqr( x ) );
  const sizeFactor = stdlib.base.pow( x.length, -1/5 );
  return 0.9 * spread * sizeFactor;
}
Insert cell
Insert cell
Insert cell
Insert cell
draws = {
var values = new Array( 100 );
stdlib.inmap( values, drawSample );
return values;
function drawSample() {
var x = stdlib.base.random.bernoulli( weight );
if ( x === 1 ) {
return stdlib.base.random.normal( 2.0, 1.0 );
}
return stdlib.base.random.normal( 9.0, 2.0 );
}
}
Insert cell
Insert cell
plt14 = {
var x = stdlib.linspace( 0, 14, 200 );
var h = guessBandwidth( draws );
var yhat = x.map( function( x ) { return fhat( x, draws, K, h ); } );
var y = x.map( function( x ) { return truePDF( x ) } );
var plt = stdlib.plot( [ x, x ], [ y, yhat ] );
plt.colors = [ 'red', 'blue' ];
plt.width = 600;
return plt;
}
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Runs the mean-shift update from every observation.
 *
 * Each starting point is repeatedly replaced by the kernel-weighted mean of
 * all observations, with weights `K( ( point - xs[i] ) / h )`. The input
 * array is not modified; a copy is shifted and returned.
 *
 * @param {Array<number>} xs - observations (also used as starting points)
 * @param {Function} K - kernel function taking a single number
 * @param {number} h - bandwidth
 * @param {number} [iterations=300] - number of update steps per point
 * @returns {Array<number>} shifted position of every starting point
 */
function meanShift( xs, K, h, iterations = 300 ) {
  const shifted = stdlib.copy( xs );
  const n = xs.length;
  for ( let j = 0; j < shifted.length; j++ ) {
    for ( let step = 0; step < iterations; step++ ) {
      let num = 0.0;
      let denom = 0.0;
      for ( let i = 0; i < n; i++ ) {
        // Evaluate the kernel once and reuse it for numerator and denominator.
        const w = K( ( shifted[ j ] - xs[ i ] ) / h );
        num += xs[ i ] * w;
        denom += w;
      }
      shifted[ j ] = num / denom;
    }
  }
  return shifted;
}
Insert cell
// Mean-shift result for every value in `draws`, using kernel `K` and the
// bandwidth returned by `guessBandwidth( draws )`.
centers = meanShift( draws, K, guessBandwidth( draws ) )
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more