Combining Github Traffic Plots Across Repositories

This post will show how to use the RSelenium package to scrape your own github account to retrieve all that fun traffic data of clones and visits and create a single traffic plot for your account.

For the single file you can find it in this gist file.

Packages

library(RSelenium)
library(XML)
library(ggplot2)
library(reshape2)
library(plyr)
library(dplyr)

Fill in the relevant information for your account. The team is usually your username, but it can be different. The repos can be a vector and since we are going in the front door of the site we can access the private repositories too!

Setup

gh_user <- '<your github login name>'
gh_pass <- '<your github login password>'

gh_team <- '<team associated with account>'
repos <- '<repositories in team>'

The function

github_traffic <- function(gh_user,gh_pass,gh_team,repos){

#open the connection

rD <- RSelenium::rsDriver(verbose = FALSE)
remDr <- rD[["client"]]

#going to the first repo to invoke the login

remDr$navigate(sprintf('https://github.com/%s/%s/graphs/traffic',gh_team,repos[1]))

#entering the login information in the form and clicking the button. 

webElem <- remDr$findElement(using = 'id', value = "login_field")
webElem$setElementAttribute(attributeName = 'value',value = gh_user)
webElem <- remDr$findElement(using = 'id', value = "password")
webElem$setElementAttribute(attributeName = 'value',value = gh_pass)
webElem=remDr$findElement(using = 'xpath','//*[@id="login"]/form/div[4]/input[3]')
webElem$clickElement()
Sys.sleep(1)

# Retrieve the plots into an html
out <- plyr::llply(repos,function(repo){
  remDr$navigate(sprintf('https://github.com/%s/%s/graphs/traffic',gh_team,repo))
  Sys.sleep(1)
  out <- XML::htmlParse(remDr$getPageSource(),asText = TRUE)
  sapply(c('clones','visitors'),function(type){
  XML::getNodeSet(out,sprintf(sprintf('//*[@id="js-%s-graph"]/div/div[1]/svg/g/g',type)))
},simplify = FALSE,USE.NAMES = TRUE)
},.progress = 'text')

# set the names (llply doesnt)
names(out) <- repos

# that's it we dont need the connection anymore
remDr$close()
rD[["server"]]$stop()

# scrape the data from html into a data.frame

plot_data <- plyr::ldply(out,function(repo){
  plyr::mdply(names(repo),function(type){
    
    dat <- repo[[type]]
  
    if(is.null(dat)) return(NULL)
    
    # tick values we need for rescaling
    yticks_total <- as.numeric(sapply(getNodeSet(dat[[2]],'g'),XML::xmlValue))
    yticks_unique <- as.numeric(sapply(getNodeSet(dat[[5]],'g'),XML::xmlValue))
    
    x <- data.frame(type=type,
                    date = as.Date(sapply(getNodeSet(dat[[1]],'g'),XML::xmlValue),format = '%m/%d'),
                    total = as.numeric(sapply(getNodeSet(dat[[3]],'circle'),XML::xmlGetAttr,name = 'cy')),
                    unique = as.numeric(sapply(getNodeSet(dat[[4]],'circle'),XML::xmlGetAttr,name = 'cy')))
    
    # Because this is a d3.js object there are some technical details that
    # I'm skipping here, but in short the y values need to be rescaled 
    # to show the actual values that you need.
    x$total <- scales::rescale(x$total,rev(range(yticks_total)))
    x$unique <- scales::rescale(x$unique,rev(range(yticks_unique)))
    
    #rehape the data.frame from wide to long
    x%>%reshape2::melt(.,c('type','date'),variable.name=c('metric'))
  })
},.id='repo')

#Create the plot

ggplot(plot_data,aes(x=date,y=value,colour=repo))+
  geom_point()+geom_line()+
  facet_grid(type~metric,scales='free_y')+
  scale_x_date(date_breaks = "1 day",date_labels = "%m/%d")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=90),legend.position = 'top')+
  labs(title=sprintf('Github Team: %s',gh_team))
}

Run the function

traffic_plot <- github_traffic(gh_user=gh_user,
                               gh_pass=gh_pass,
                               gh_team=gh_team,
                               repos=repos)
traffic_plot

If the function fails for some reason this will release the port RSelenium is holding ransom.

rD <- RSelenium::rsDriver(verbose = FALSE,port=4444L)
remDr <- rD$client
remDr$close()
comments powered by Disqus