Published
Edited
Apr 3, 2019
Insert cell
Insert cell
Insert cell
Insert cell
// Load the file with tweets in the Boston area (uploaded to Stellar)
viewof text = html`<input type=file * ">`
Insert cell
// Load the Boston tweets file chosen in the `text` file input as text.
// NOTE(review): presumably Files.text yields the whole file's contents as one string
// (the next cell passes it straight to JSON.parse) -- confirm.
tweetsStrings = Files.text(text)
Insert cell
// Turn the Boston tweets text into an array: parse it as JSON, then keep only the
// values of the parsed top-level object (its keys are discarded).
// NOTE(review): assumes each top-level value is one tweet object -- confirm against the scraper output.
tweetsDF = Object.values(JSON.parse(tweetsStrings))
Insert cell
Insert cell
//From the Boston tweets, get a count of unique locations that users reported on their profiles.
//This uses part of the script from Carlos' week-5a-web-APIs

count = {
//create new array to hold the locations and their counts
let tempList = []
//group Boston tweets by location
//then loop through the unique locations counting up the number of tweets in that group
let groupedLocations = z.groupBy(x => x.location, tweetsDF)
for (const key of Object.keys(groupedLocations)) {
tempList.push({location: key, count: groupedLocations[key].length})
}
return tempList //return the final df of locations and count of tweets from each location
}
Insert cell
// Framingham is spelled three different ways: "Framingham, ma" "Framingham, MA" or "Framingham, Massachusetts" Change these all to "Framingham, MA"
// Clean up other terms that have multiple spellings as well

tweetsDFClean = {
//set up an array to hold final results
let cleanedArray = [];
//loop through the entire set of Boston tweets looking for cases where the same location
//is spelled in a different way, and convert those multiple spellings into one unique spelling
for (var i = 0; i < z.getCol("location", tweetsDF).length; i++) {
let correctLoc = "";
if(tweetsDF[i].location === "Framingham, ma" || tweetsDF[i].location === "Framingham, Massachusetts"){
correctLoc = "Framingham, MA";
}
else if(tweetsDF[i].location === "Natick, Ma"){
correctLoc = "Natick, MA";
}
else if(tweetsDF[i].location === "Massachusetts" || tweetsDF[i].location === "Massachusetts, MA"){
correctLoc = "Massachusetts, USA";
}
else if(tweetsDF[i].location === ""){
correctLoc = "N/A";
}
else if(tweetsDF[i].location === "shrewsbury MA"){
correctLoc = "Shrewsbury, MA";
}
else if(tweetsDF[i].location === "Marlborough, Ma."){
correctLoc = "Marlborough, MA";
}
else if(tweetsDF[i].location === "Sudbury, Massachusetts"){
correctLoc = "Sudbury, MA";
}
else if(tweetsDF[i].location === "Holliston, Massachusetts"){
correctLoc = "Holliston, MA";
}
else if(tweetsDF[i].location === "Northborough, Massachusetts "){
correctLoc = "Northborough, MA";
}
else if(tweetsDF[i].location === "New York"){
correctLoc = "New York, USA";
}
else if(tweetsDF[i].location === "Hudson, MA, USA"){
correctLoc = "Hudson, MA";
}
else {
correctLoc = tweetsDF[i].location
}
//set up a new row for the new dataframe that is exactly like the row in the existing Boston tweets dataframe
//but we add a column with the cleaned location
//push cleaned row into our final df
let currRow = tweetsDF[i];
currRow.locationCleaned = correctLoc;
cleanedArray.push(currRow);
}
return cleanedArray; //return final df
}

Insert cell
countCleaned = {
let count2 = []; //set up new results array
//group Boston tweets by location
//then loop through the unique CLEANED locations counting up the number of tweets in that group
let groupedLocations2 = z.groupBy(x => x.locationCleaned, tweetsDFClean)
for (const key of Object.keys(groupedLocations2)) {
count2.push({locationCleaned: key, count: groupedLocations2[key].length})
}
count2 = z.sortByCol('count', 'des', count2) //sort from most to least popular location
return count2; //return final array
}
Insert cell
// Bar graph of tweets per self-reported location associated with the Boston tweets.
// Data comes from countCleaned: one bar per locationCleaned on x, the tweet count on y,
// and each bar colored by its location.
// NOTE(review): `sort:'*'` is not a sort value I can confirm from the Vega-Lite docs; if the
// goal is to keep countCleaned's most-to-least order, check whether `sort: null` is needed.
vegalite({
data: {values: countCleaned},
mark: "bar",
encoding: {
x: {field: "locationCleaned", type: "nominal", sort:'*'},
y: {field: "count", type: "quantitative"},
color: {"field": "locationCleaned", "type": "nominal"}
}
})
Insert cell
Insert cell
// I ran a test scraper for 2 minutes that captured one geolocated tweet. My actual 10-minute scrape did not find any geolocated tweets, and my second attempt at a 10-minute scrape didn't either. So, I just concatenated my 2-minute and my 10-minute scrapes and captured that one geolocated tweet.
// This line keeps only the tweets that have a latitude: the loose `!= null` check
// drops rows whose `lat` is either null or undefined.
geoLocatedTweet = z.filter(r => r.lat != null, tweetsDF)
Insert cell
// Plot of the lat/long of the one geocoded tweet in my dataset using vegalite.
// Data comes from geoLocatedTweet: `lon` on the x axis (titled "Longitude") and
// `lat` on the y axis (titled "Latitude"), drawn as points.
vegalite({
data: {values: geoLocatedTweet},
mark: "point",
encoding: {
x: {field: "lon", type: "quantitative", axis:{title: "Longitude"}},
y: {field: "lat", type: "quantitative", axis:{title: "Latitude"}}
}
})
Insert cell
Insert cell
// I looked for Tweets that mentioned bitcoin.
// this line uploads the resulting scraped file
viewof hashtag_text = html`<input type=file * ">`
Insert cell
// Load the Bitcoin scrape file chosen in the `hashtag_text` file input as text.
// NOTE(review): presumably Files.text yields the whole file's contents as one string
// (the next cell passes it straight to JSON.parse) -- confirm.
tweetHashtagText = Files.text(hashtag_text)
Insert cell
// Convert the Bitcoin scrape into an array: parse the text as JSON, then keep only the
// values of the parsed top-level object (its keys are discarded).
// NOTE(review): assumes each top-level value is one tweet object -- confirm against the scraper output.
tweetsHashtag = Object.values(JSON.parse(tweetHashtagText))
Insert cell
Insert cell
// count tweets from each location in raw data
count3 = {
let a = [] //set up new array for holding results
//group Bitcoin tweets by location
//then loop through the unique locations counting up the number of tweets in that group
let loc = z.groupBy(x => x.location, tweetsHashtag)
for (const key of Object.keys(loc)) {
a.push({location: key, count: loc[key].length})
}
a = z.sortByCol('count', 'des', a) //sort from most to least popular locations
return a //return results array
}
Insert cell
//clean up locations by replacing variants of the same location name with just one location name
tweetsHashtagClean = {
let c = []; //set up array to hold results
// loop through each row of df containing Bitcoin tweets. For each tweet, check whether the location
// is a variant of another location. If so, replace the variants with a reasonable unique spelling
for (var i = 0; i < z.getCol("location", tweetsHashtag).length; i++) {
let loc = "";
if(tweetsHashtag[i].location === "earth" || tweetsHashtag[i].location === "Planet Earth" ||
tweetsHashtag[i].location === "Worldwide" || tweetsHashtag[i].location === "World" ||
tweetsHashtag[i].location === "Citizen of the World" || tweetsHashtag[i].location === "EARTHLING" ||
tweetsHashtag[i].location === "The World" || tweetsHashtag[i].location === "everywhere and nowhere" ||
tweetsHashtag[i].location === "Here" || tweetsHashtag[i].location === "International" ||
tweetsHashtag[i].location === "world" || tweetsHashtag[i].location === "Somewhere, Someplace " ||
tweetsHashtag[i].location === "The Earth"){
loc = "Earth";
}
else if(tweetsHashtag[i].location === "BlockChain" || tweetsHashtag[i].location === "Blockchain web"){
loc = "Blockchain";
}
else if(tweetsHashtag[i].location === "Vancouver Canada"){
loc = "Vancouver, British Columbia";
}
else if(tweetsHashtag[i].location === "United States 🇺🇸" || tweetsHashtag[i].location === "USA"){
loc = "United States";
}
else if(tweetsHashtag[i].location === "日本" || tweetsHashtag[i].location === "japan"){
loc = "Japan";
}
else if (tweetsHashtag[i].location === "U.K. London"){
loc = "London, England"
}
else if(tweetsHashtag[i].location === "NY" || tweetsHashtag[i].location === "New York, USA"){
loc = "New York, NY";
}
else if(tweetsHashtag[i].location === "San Francisco Bay" ){
loc = "San Francisco, CA";
}
else if(tweetsHashtag[i].location === "Los Angeles " || tweetsHashtag[i].location === "Los Angeles" ){
loc = "Los Angeles, CA";
}
else if(tweetsHashtag[i].location === "" ){
loc = "N/A";
}
else {
loc = tweetsHashtag[i].location;
}
//rows of new results df are same as the old Bitcoin tweets df rows but with cleaned locations added
let thisRow = tweetsHashtag[i];
thisRow.locationCleaned = loc;
c.push(thisRow);
}
return c; //return the cleaned df
}
Insert cell
count4 = {
//group and count, just like the counts we did before
let d = [];
let groupedloc = z.groupBy(x => x.locationCleaned, tweetsHashtagClean)
for (const key of Object.keys(groupedloc)) {
d.push({locationCleaned: key, count: groupedloc[key].length})
}
d = z.sortByCol('count', 'des', d)
return d;
}
Insert cell
// Bar graph of the count of Tweets from each location, using vegalite.
// Data comes from count4: one horizontal bar per locationCleaned (y), bar length = count (x).
// Many places had only one tweet, so they barely show up on the graph given that the scale maxes out at 200 (over 200 tweets came from profiles that did not list a location).
// NOTE(review): `sort:'*'` is not a sort value I can confirm from the Vega-Lite docs; if the
// goal is to keep count4's most-to-least order, check whether `sort: null` is needed.
vegalite({
data: {values: count4},
mark: "bar",
encoding: {
y: {field: "locationCleaned", type: "nominal", sort:'*'},
x: {field: "count", type: "quantitative"},
// Per-location coloring is deliberately left switched off here:
//color: {"field": "locationCleaned", "type": "nominal"}
}
})
Insert cell
Insert cell
geoCoded = {
// filtered for tweets with lat longs
let a = z.filter(r => r.lat != null, tweetsHashtagClean);
//created new df just like the df containing the list of geocded tweets,
//but with a new column counting the number of hashtags used in the tweet
let b = [];
for (let i = 0; i < z.getCol("tweet_id", a).length; i++){
let aRow = a[i];
aRow.hashtag_count = a[i].hashtag.length;
b.push(aRow);
}
return b;
}
Insert cell
// Plot of the lat/long of the geocoded tweets in the Bitcoin dataset (rows of geoCoded):
// `lon` on the x axis (titled "Longitude") and `lat` on the y axis (titled "Latitude").
// Color dots by number of hashtags in the tweet.
// NOTE(review): hashtag_count is a number but is encoded as "nominal", so each distinct
// count is treated as its own category -- confirm that is intended rather than "quantitative".
vegalite({
data: {values: geoCoded},
mark: "point",
encoding: {
x: {field: "lon", type: "quantitative", axis:{title: "Longitude"}},
y: {field: "lat", type: "quantitative", axis:{title: "Latitude"}},
color: {"field": "hashtag_count", "type": "nominal"}
},
})
Insert cell
Insert cell
vegalite = require("@observablehq/vega-lite@0.1")
Insert cell
z = require('https://bundle.run/zebras@0.0.11')
Insert cell
d3 = require("d3-fetch@1")
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more