K-Means Clustering of @plotlygraphs Twitter Followers

The following are step-by-step instructions for replicating what I’ve created: https://plot.ly/~hianalytics/168

The hardest part is step 1 but the rest is straightforward.

Feel free to reach out with any questions!

Justin

justin@harborislandanalytics.com

0.1 Install Packages & Setup Twitter API

How To Link for Twitter API Setup

# Load the packages used in this walkthrough.
# twitteR is installed only when it is missing, so re-running the script does
# not download and reinstall it every time.
library(rmarkdown)
if (!requireNamespace("twitteR", quietly = TRUE)) {
  install.packages("twitteR", dependencies = TRUE)
}
library(twitteR)
library(RCurl)

# Authenticate against the Twitter API, then download the accounts returned by
# getFriends() and getFollowers(n=1500) for @plotlygraphs into a data frame.

# Path to the CA bundle that ships with RCurl; used both for the global
# RCurl option and for the OAuth handshake below.
caFile <- system.file("CurlSSL", "cacert.pem", package = "RCurl")

# Set SSL certs globally
options(RCurlOptions = list(cainfo = caFile))

reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"

# Credentials are read from the TWITTER_API_KEY / TWITTER_API_SECRET
# environment variables when they are set; otherwise replace the placeholders
# below with your own key and secret (they are two different values).
apiKey <- Sys.getenv("TWITTER_API_KEY", unset = "YOURAPIKEY")
apiSecret <- Sys.getenv("TWITTER_API_SECRET", unset = "YOURAPISECRET")

twitCred <- OAuthFactory$new(
  consumerKey = apiKey,
  consumerSecret = apiSecret,
  requestURL = reqURL,
  accessURL = accessURL,
  authURL = authURL
)
# Running the next line of code brings up a response in the console with a link. Follow the link then copy/paste the ID like the instructions say
twitCred$handshake(cainfo = caFile)
# Verify your connection (results = TRUE if so)
registerTwitterOAuth(twitCred)

# Collect the accounts around @plotlygraphs and flatten them to a data frame.
user <- getUser("plotlygraphs")
userFriends <- user$getFriends()
userFollowers <- user$getFollowers(n = 1500)
userNeighbors <- union(userFollowers, userFriends)
userNeighbors.df <- twListToDF(userNeighbors)

NOTE: I recommend saving the data you collected from Twitter to a local file. That way you don’t have to redo the hardest step and call the API again each time you restart this analysis.

# Cache the downloaded data on disk and read it back, so later sessions can
# start from the CSV instead of calling the Twitter API again.
setwd("/...") #set your working directory if not done already
# row.names = FALSE keeps the row index out of the file; otherwise it would be
# read back as an extra, meaningless "X" column.
write.csv(userNeighbors.df, "userNeighbors_df.csv", row.names = FALSE)
# stringsAsFactors = FALSE keeps text columns (e.g. screenName) as character
# on every R version.
userNeighbors.df <- read.csv("userNeighbors_df.csv", stringsAsFactors = FALSE)

0.2 Clean Data

# Build the log-scaled features used for clustering and plotting.
#
# log10(0) is -Inf, which would break the clustering step, so each count is
# floored at 1 (log10(1) = 0) before taking the log. The floor is applied only
# inside the log: the raw count columns, and every other column of the data
# frame, are left untouched.
# Base-10 logs are used so the values match the "Log 10 Scale" plot title.
userNeighbors.df$logFollowersCount <-
  log10(pmax(userNeighbors.df$followersCount, 1))
userNeighbors.df$logFriendsCount <-
  log10(pmax(userNeighbors.df$friendsCount, 1))
userNeighbors.df$logStatusesCount <-
  log10(pmax(userNeighbors.df$statusesCount, 1))

#Isolate data for analysis
kObject.log <- data.frame(userNeighbors.df$logFriendsCount,userNeighbors.df$logFollowersCount)

0.3 Elbow Plot - Estimate Number of Centers

# Elbow plot: total within-cluster sum of squares for 1..maxClusters centers.
mydata <- kObject.log
maxClusters <- 15

# kmeans starts from randomly chosen centers. Fixing the seed makes the curve
# reproducible, and several starts per k keep a single unlucky start from
# putting a bump in it.
set.seed(42)

# Preallocate; entry k holds the within-groups sum of squares for k clusters.
wss <- numeric(maxClusters)
# With one cluster, the within-groups sum of squares is the total sum of
# squares: (n - 1) times the summed column variances.
wss[1] <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (k in 2:maxClusters) {
  wss[k] <- kmeans(mydata, centers = k, nstart = 10)$tot.withinss
}

plot(seq_len(maxClusters), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

plot of chunk unnamed-chunk-7

The “elbow” in the graph above appears at roughly 4 clusters, so we’ll use 4 centers in step 4…


0.4 k-means Clustering

##Run the K Means algorithm, specifying centers (4 centers)
# kmeans picks its starting centers at random. Fixing the seed keeps the
# cluster numbering, and therefore which trace each account lands in, the same
# from one run to the next.
set.seed(42)
user2Means.log <- kmeans(kObject.log, centers = 4, iter.max = 10, nstart = 100)

##Add the vector of specified clusters back to the original vector as a factor
userNeighbors.df$cluster <- factor(user2Means.log$cluster)

0.5 Plot.ly Scatter Plot of Results

# Work on a copy so the clustered data frame itself is left as it is.
un.df <- userNeighbors.df

# Tool-tip text shown for each point: handle plus the three raw counts,
# separated by HTML line breaks.
un.df$text <- paste0(
  "Handle: ", un.df$screenName, "<br>",
  "Followers: ", un.df$followersCount, "<br>",
  "Friends: ", un.df$friendsCount, "<br>",
  "Tweets: ", un.df$statusesCount
)

# Keep only the columns the plot needs.
un.df <- un.df[, c(
  "cluster",
  "text",
  "logFollowersCount",
  "logFriendsCount",
  "followersCount",
  "friendsCount"
)]

# Load Plot.ly and create the client object used for the upload.
library(plotly)
p <- plotly(username = "USERNAME", key = "PASSWORD")

0.5.1 Build Dataset for Plotly

# Build one Plot.ly scatter trace per cluster.
#
# The rows of un.df are split by their cluster label, so the traces are driven
# by the same data frame they are drawn from and do not rely on the labels
# being exactly 1..k. Groups come out in label order. An empty un.df gives an
# empty list. unname() keeps `data` a plain, unnamed list of traces.
data <- unname(lapply(
  split(un.df, un.df$cluster, drop = TRUE),
  function(members) {
    list(
      x = members[["logFollowersCount"]],
      y = members[["logFriendsCount"]],
      text = members[["text"]],
      mode = "markers",
      marker = list(
        line = list(
          color = "white",
          width = 0.5
        ),
        size = 8
      ),
      type = "scatter"
    )
  }
))

# Plot layout: no legend, a title, and two auto-ranged axes with titles.
layout <- local({
  # Both axes share the same shape; only the title differs.
  axisSpec <- function(label) {
    list(title = label, autorange = TRUE)
  }

  list(
    showlegend = FALSE,
    title = "@plotlygraphs Friends vs Followers (Top 1,500) - Log 10 Scale",
    xaxis = axisSpec("Log Followers Count"),
    yaxis = axisSpec("Log Friends Count")
  )
})

0.5.2 Push to Plot.ly, Open Browser to View Plot

# Upload the traces and layout to Plot.ly (overwriting any plot with the same
# file name), keep the returned file name and URL, and open the URL in a
# browser.
response <- p$plotly(
  data,
  kwargs = list(
    layout = layout,
    filename = "Clusters of @plotlygraphs Twitter Followers (Top 1,500)",
    fileopt = "overwrite"
  )
)
filename <- response$filename
url <- response$url
browseURL(url)