The following are step-by-step instructions for replicating what I’ve created: https://plot.ly/~hianalytics/168
The hardest part is step 1, but the rest is straightforward.
Feel free to reach out with any questions!
Justin
justin@harborislandanalytics.com
0.1 Install Packages & Setup Twitter API
How To Link for Twitter API Setup
# Load packages and authenticate against the Twitter API, then collect the
# friends and followers of one account into a data frame.
library(rmarkdown)
# Install twitteR only when it is missing, so that re-running the script does
# not reinstall the package every time.
if (!requireNamespace("twitteR", quietly = TRUE)) {
  install.packages("twitteR", dependencies = TRUE)
}
library(twitteR)
library(RCurl)

# Set SSL certs globally
options(RCurlOptions = list(
  cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")
))

# OAuth endpoints
reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"

# Replace both placeholders with the credentials of your own Twitter app.
# The key and the secret are two different values.
apiKey <- "YOURAPIKEY"
apiSecret <- "YOURAPISECRET"

# NOTE(review): OAuthFactory is not defined in this script; presumably it is
# made available when twitteR is attached -- confirm for your twitteR version.
twitCred <- OAuthFactory$new(
  consumerKey = apiKey,
  consumerSecret = apiSecret,
  requestURL = reqURL,
  accessURL = accessURL,
  authURL = authURL
)

# Running the next line of code brings up a response in the console with a
# link. Follow the link then copy/paste the ID like the instructions say
twitCred$handshake(
  cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")
)

# Verify your connection (results = TRUE if so)
registerTwitterOAuth(twitCred)

# Collect the account's friends and followers (followers requested with
# n = 1500), merge the two sets and convert the result to a data frame.
user <- getUser("plotlygraphs")
userFriends <- user$getFriends()
userFollowers <- user$getFollowers(n = 1500)
userNeighbors <- union(userFollowers, userFriends)
userNeighbors.df <- twListToDF(userNeighbors)
NOTE: I recommend saving the data you collected from Twitter locally. That way you don’t have to redo the hardest step (calling the API) each time you restart this analysis.
# Cache the collected Twitter data on disk and read it back, so later runs can
# start from the CSV instead of calling the API again.
setwd("/...") # set your working directory if not done already
# row.names = FALSE: otherwise the row index is written as an unnamed first
# column and comes back from read.csv() as a spurious "X" column.
write.csv(userNeighbors.df, "userNeighbors_df.csv", row.names = FALSE)
# stringsAsFactors = FALSE keeps text columns as character vectors on every
# R version (the default was TRUE before R 4.0).
userNeighbors.df <- read.csv("userNeighbors_df.csv", stringsAsFactors = FALSE)
0.2 Clean Data
# Take the log of the three count columns.
# log(0) is -Inf, so each count is floored at 1 before the log is taken
# (a count of 0 therefore maps to log(1) = 0). The floor is applied only inside
# the transform: the raw count columns, and every other column of the data
# frame, are left exactly as collected.
userNeighbors.df$logFollowersCount <-
  log(pmax(userNeighbors.df$followersCount, 1))
userNeighbors.df$logFriendsCount <-
  log(pmax(userNeighbors.df$friendsCount, 1))
userNeighbors.df$logStatusesCount <-
  log(pmax(userNeighbors.df$statusesCount, 1))

# Isolate data for analysis: log friends and log followers are the two
# features handed to k-means.
kObject.log <- data.frame(
  userNeighbors.df$logFriendsCount,
  userNeighbors.df$logFollowersCount
)
0.3 Elbow Plot - Estimate Number of Centers
# Elbow plot: within-groups sum of squares for k = 1..15 clusters.
mydata <- kObject.log
# k-means starts from randomly chosen centers; fixing the seed makes the curve
# (and therefore the chosen elbow) reproducible between runs.
set.seed(42)
max_k <- 15L
wss <- c(
  # k = 1: the within-groups SS equals the total SS, i.e.
  # (n - 1) * sum of the per-column variances.
  (nrow(mydata) - 1) * sum(vapply(mydata, var, numeric(1))),
  # k = 2..max_k: sum of the per-cluster within SS of a k-means fit.
  vapply(
    2:max_k,
    function(k) sum(kmeans(mydata, centers = k)$withinss),
    numeric(1)
  )
)
plot(
  seq_len(max_k), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)
The “elbow” in the graph above appears to be at about 4 clusters, so we’ll use 4 centers in step 4…
0.4 k-means Clustering
## Run the k-means algorithm with 4 centers (the elbow estimate).
## kmeans() draws its starting centers at random, so the seed is fixed to keep
## the cluster numbering stable between runs.
set.seed(42)
user2Means.log <- kmeans(
  kObject.log,
  centers = 4,
  iter.max = 10,
  nstart = 100
)
## Add the cluster assignment of each row back to the original data frame as a
## factor
userNeighbors.df$cluster <- factor(user2Means.log$cluster)
0.5 Plot.ly Scatter Plot of Results
# Work on a copy of the clustered data frame
un.df <- userNeighbors.df

# Tool-tip text: one HTML line each for handle, followers, friends and tweets
un.df$text <- paste0(
  "Handle: ", un.df$screenName, "<br>",
  "Followers: ", un.df$followersCount, "<br>",
  "Friends: ", un.df$friendsCount, "<br>",
  "Tweets: ", un.df$statusesCount
)

# Keep only the columns needed for plotting
plot_cols <- c(
  "cluster", "text",
  "logFollowersCount", "logFriendsCount",
  "followersCount", "friendsCount"
)
un.df <- un.df[, plot_cols]

# Load Plot.ly and create the client object used for uploading
library(plotly)
p <- plotly(username = "USERNAME", key = "PASSWORD")
0.5.1 Build Dataset for Plotly
# Build one Plotly scatter trace per cluster.
# The cluster labels are read from un.df, the same data frame the points are
# taken from, instead of assuming the labels are exactly 1..K.
cluster_ids <- levels(factor(un.df$cluster))
data <- lapply(cluster_ids, function(k) {
  # which() drops NA comparisons, matching what subset() would select
  rows <- which(un.df$cluster == k)
  list(
    x = un.df$logFollowersCount[rows],
    y = un.df$logFriendsCount[rows],
    text = un.df$text[rows],
    mode = "markers",
    marker = list(
      line = list(color = "white", width = 0.5),
      size = 8
    ),
    type = "scatter"
  )
})
# Format layout: plot title, axis titles, no legend.
# The counts in this script are transformed with log(), which is the natural
# logarithm, so the title says "Natural Log" rather than "Log 10".
layout <- list(
  showlegend = FALSE,
  title = "@plotlygraphs Friends vs Followers (Top 1,500) - Natural Log Scale",
  xaxis = list(
    title = "Log Followers Count",
    autorange = TRUE
  ),
  yaxis = list(
    title = "Log Friends Count",
    autorange = TRUE
  )
)
0.5.2 Push to Plot.ly, Open Browser to View Plot
# Send the traces and the layout to Plot.ly under a fixed file name.
# NOTE(review): fileopt = "overwrite" presumably replaces an existing plot of
# the same name -- confirm against the Plot.ly API documentation.
response <- p$plotly(
  data,
  kwargs = list(
    layout = layout,
    filename = "Clusters of @plotlygraphs Twitter Followers (Top 1,500)",
    fileopt = "overwrite"
  )
)

# Keep the URL and file name returned in the response, then open the plot in
# the browser
url <- response$url
filename <- response$filename
browseURL(url)
0.6 Credits & Helpful Links
- http://rmarkdown.rstudio.com/index.html
- http://www.r-bloggers.com/cluster-your-twitter-data-with-r-and-k-means/
- http://rstudio-pubs-static.s3.amazonaws.com/5983_af66eca6775f4528a72b8e243a6ecf2d.html
- http://blog.plot.ly/post/71637573256/the-power-of-bubble-charts
- https://plot.ly/r/bubblecharts/
- https://plot.ly/r/line-and-scatter/