Retrieve Web Traffic Data using Alexa API
The Alexa Web Information Service (AWIS) API makes Alexa’s vast repository of information about web traffic and the structure of the web available to developers. The following figure illustrates the weekly page views of Google's website, collected through the Alexa Web Information Service API.
To retrieve web traffic data, you have to send signed query requests to the Alexa API. Alexa provides some sample code, but only in Java, PHP, Perl, and Ruby. This post therefore introduces sample code written in R that extracts web traffic data by interacting with the Alexa Web Information Service.
Library
library(XML) library(RCurl) library(digest) library(base64) library(base64enc)
Interact with Alexa API
# Retrieve Alexa traffic history for one site through the Alexa Web
# Information Service (AWIS) "TrafficHistory" action.
#
# Args:
#   site:            Site to query, e.g. "google.com".
#   StartNumList:    Vector of start dates in yyyymmdd form. One request is
#                    issued per element, each asking for 31 entries.
#   accessKeyId:     AWS access key id. Defaults to the AWS_ACCESS_KEY_ID
#                    environment variable ("" when it is unset).
#   secretAccessKey: AWS secret access key. Defaults to the
#                    AWS_SECRET_ACCESS_KEY environment variable ("" when unset).
#
# Returns:
#   list(site = , hist_data = ) where hist_data is a data.frame with the
#   columns Date, PV, APV, Rank and Reach, holding one row per distinct date
#   (ordered by date, row names set to the date). hist_data is NULL when
#   StartNumList is empty.
getTrafficData = function(site, StartNumList,
                          accessKeyId = Sys.getenv("AWS_ACCESS_KEY_ID"),
                          secretAccessKey = Sys.getenv("AWS_SECRET_ACCESS_KEY")) {

  # Percent-encode a string, keeping the unreserved characters "-", "_", "."
  # and "~" literal even if curlEscape() encoded them.
  chulencode = function(a) {
    ret = curlEscape(a)
    ret = gsub("%2D", "-", ret, fixed = TRUE)
    ret = gsub("%5F", "_", ret, fixed = TRUE)
    ret = gsub("%2E", ".", ret, fixed = TRUE)
    ret = gsub("%7E", "~", ret, fixed = TRUE)
    ret
  }

  # Current time as an ISO 8601 timestamp. The literal "Z" suffix denotes UTC,
  # so the time must be formatted in UTC rather than in the local time zone.
  getTimestamp = function() {
    format(Sys.time(), "%Y-%m-%dT%H:%M:%S.000Z", tz = "UTC")
  }

  # Build the "name=value&name=value" query string for one request.
  # The parameters are listed in alphabetical order and emitted in that order;
  # the same string is later signed, so the order must not be changed.
  getUrlInfo = function(arg) {
    temparg = list(
      AWSAccessKeyId   = arg$accessKeyId,
      Action           = arg$ActionName,
      Count            = arg$NumReturn,
      ResponseGroup    = arg$ResponseGroupName,
      SignatureMethod  = arg$HashAlgorithm,
      SignatureVersion = arg$SigVersion,
      Start            = arg$StartNum,
      Timestamp        = getTimestamp(),
      Url              = arg$site
    )
    pairs = paste0(names(temparg), "=", chulencode(as.character(temparg)))
    paste(pairs, collapse = "&")
  }

  # HMAC-SHA256 signature of the request, base64-encoded and URL-escaped.
  # Reads the service host and secret key from `arg` in the enclosing scope.
  generateSignature = function(queryParams) {
    sign = paste0("GET\n", tolower(arg$ServiceHost), "\n/\n", queryParams)
    sig = base64(hmac(arg$secretAccessKey, sign, "sha256", raw = TRUE))
    chulencode(sig)
  }

  arg = list(
    ActionName        = "TrafficHistory",
    ResponseGroupName = "History",
    ServiceHost       = "awis.amazonaws.com",
    NumReturn         = 31,
    StartNum          = NULL,
    SigVersion        = "2",
    HashAlgorithm     = "HmacSHA256",
    accessKeyId       = accessKeyId,
    secretAccessKey   = secretAccessKey,
    site              = site
  )

  # XML namespace used by every XPath query on the response.
  ns = c(x = "http://awis.amazonaws.com/doc/2005-07-11")
  extractValues = function(doc, path) {
    xpathSApply(doc, path, xmlValue, namespaces = ns, simplify = TRUE)
  }

  # Issue one signed request and turn the response into a data.frame.
  fetchChunk = function(startNum) {
    req = arg
    req$StartNum = startNum
    queryParams = getUrlInfo(req)
    sig = generateSignature(queryParams)
    url = paste0("http://", arg$ServiceHost, "/?", queryParams,
                 "&Signature=", sig)
    raw = xmlParse(url)
    data.frame(
      Date  = extractValues(raw, "//x:Date"),
      PV    = extractValues(raw, "//x:PageViews/x:PerMillion"),
      APV   = extractValues(raw, "//x:PageViews/x:PerUser"),
      Rank  = extractValues(raw, "//x:Rank"),
      Reach = extractValues(raw, "//x:Reach/x:PerMillion")
    )
  }

  # Collect all chunks first and bind once instead of growing a data.frame
  # inside a loop. do.call(rbind, list()) yields NULL for an empty input.
  chunks = lapply(StartNumList, fetchChunk)
  hist_data = do.call(rbind, chunks)

  # Consecutive 31-entry windows can overlap: keep the first row seen for each
  # date, ordered by date, and label the rows with the date.
  if (!is.null(hist_data) && nrow(hist_data) > 0) {
    dates = hist_data[, "Date"]
    hist_data = hist_data[match(levels(factor(dates)), dates), ]
    rownames(hist_data) = hist_data[, "Date"]
  }

  list(site = site, hist_data = hist_data)
}
Execution
# Start dates for the requests: the first day of every month from July 2007
# through September 2014, as numeric yyyymmdd values in ascending order.
firstMonth = as.Date("2007-07-01")
lastMonth = as.Date("2014-09-01")
StartNumList = as.numeric(
  format(seq(firstMonth, lastMonth, by = "month"), "%Y%m%d")
)
# Query the traffic history of google.com for every start date in
# StartNumList; the result is a list with elements `site` and `hist_data`.
google = getTrafficData("google.com",StartNumList)