
Data extraction with R

A script for extracting data with R.
It also includes a formula that mimics weighted sort.

How to use it is covered in the related article here.
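
The overall flow is roughly: authenticate, list the profiles you can access, then pull the data. A minimal sketch (the e-mail address, password and profile id below are placeholders):

auth <- getAuth("you@example.com", "your-password")     # ClientLogin Auth token
accounts <- getAccounts(auth)                           # accounts / profiles visible to that login
dt <- getData(auth, id="12345678",                      # profile id taken from accounts$pr.ids
              metrics=c("visits"), dimensions=c("date"))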



library(RCurl)
library(XML)

getAuth <- function(email,pw){
  urlAc <- "https://www.google.com/accounts/ClientLogin"
                                        #method <- "post"
  rs <- postForm(urlAc, .params = list(accountType="GOOGLE", Email=email, Passwd=pw, service="analytics", source="rganalytics"))
  sub("\\n","",sub(".*Auth\\=","",rs))  # extract the Auth token from the ClientLogin response
}


getAccounts <- function(auth){
  auth.plus <- paste("GoogleLogin Auth=",auth,sep="")

  #get all webProperties
  pr.h <- basicTextGatherer()
  pr.url <- "https://www.google.com/analytics/feeds/datasources/ga/accounts/~all/webproperties/~all/profiles"
  curlPerform(url=pr.url, .opts=curlOptions(httpheader=c(Authorization=auth.plus)), writefunction=pr.h$update)
  pr.body <- pr.h$value()
  pr.xml <- xmlParse(pr.body)
  pr.ids <- xpathSApply(pr.xml, "//atom:entry/dxp:property[@name='ga:profileId']",quote(xmlGetAttr(x,'value')), namespaces=c(atom='http://www.w3.org/2005/Atom',dxp='http://schemas.google.com/analytics/2009'))
  pr.names <- xpathSApply(pr.xml, "//atom:entry/dxp:property[@name='ga:profileName']",quote(xmlGetAttr(x,'value')), namespaces=c(atom='http://www.w3.org/2005/Atom',dxp='http://schemas.google.com/analytics/2009'))
  pr.ac.ids <- xpathSApply(pr.xml, "//atom:entry/dxp:property[@name='ga:accountId']",quote(xmlGetAttr(x,'value')), namespaces=c(atom='http://www.w3.org/2005/Atom',dxp='http://schemas.google.com/analytics/2009'))
  print("get profiles and now getting accounts data")
  #get all accounts
  ac.h <- basicTextGatherer()
  ac.url <- "https://www.google.com/analytics/feeds/datasources/ga/accounts"
  curlPerform(url=ac.url, .opts=curlOptions(httpheader=c(Authorization=auth.plus)), writefunction=ac.h$update)
  ac.body <- ac.h$value()
  ac.xml <- xmlParse(ac.body)
  ac.names <- xpathSApply(ac.xml, "//atom:entry/dxp:property[@name='ga:accountName']", quote(xmlGetAttr(x,"value")), namespaces=c(atom='http://www.w3.org/2005/Atom',dxp='http://schemas.google.com/analytics/2009'))
  ac.ids <- xpathSApply(ac.xml, "//atom:entry/dxp:property[@name='ga:accountId']", quote(xmlGetAttr(x,"value")), namespaces=c(atom='http://www.w3.org/2005/Atom',dxp='http://schemas.google.com/analytics/2009'))
  print(paste("accounts",length(ac.ids), "  ", "profiles", length(pr.ids)))
  print("アカウント毎に一つずつプロファイル表示")
  pr.dt <- data.frame(pr.ids, pr.names, pr.ac.ids, stringsAsFactors=FALSE)
  ac.dt <- data.frame(ac.ids, ac.names, stringsAsFactors=FALSE)
  rt <- merge(ac.dt,pr.dt,by.x= "ac.ids", by.y="pr.ac.ids")
  print(rt[rt$pr.ids %in% tapply(rt$pr.ids,rt$ac.ids,min),])
  rt
  #dt <- as.data.frame(matrix(NA, nrow=length(entries), ncol=3))
}
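
# For reference, the data frame returned above has the columns ac.ids, ac.names,
# pr.ids and pr.names; a profile id for getData can be looked up like this
# (assumes auth obtained from getAuth):
#   accounts <- getAccounts(auth)
#   accounts[, c("ac.names", "pr.names", "pr.ids")]   # pick the pr.ids value you want to query
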
# sample query URL (visits by date and hour for one profile), kept here for reference
myurl <- "https://www.google.com/analytics/feeds/data?ids=ga%3A30547051&metrics=ga%3Avisits&dimensions=ga%3Adate%2Cga%3Ahour&start-date=2010-09-13&end-date=2010-09-27&max-results=1000"
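# This sample URL can also be fetched by hand, the same way getData does internally
# (a sketch; assumes auth from getAuth above):
#   h <- basicTextGatherer()
#   curlPerform(url=myurl,
#               .opts=curlOptions(httpheader=c(Authorization=paste("GoogleLogin Auth=", auth, sep=""))),
#               writefunction=h$update)
#   feed <- xmlParse(h$value())   # the raw Atom feed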



getData <- function(
                    auth, id, dimensions=NA, metrics=NA,segment=NA, filters=NA,
                    sort=NA, start.date=strftime((ga.today-30),"%Y-%m-%d"), end.date=strftime(ga.today,"%Y-%m-%d"), start.index=NA, max.results=NA){
  url <- "https://www.google.com/analytics/feeds/data?"
  ga.today <- Sys.Date()  # evaluated before the lazy start.date / end.date defaults are used
  ids <- paste("ids=ga:",id,sep="")
  dimp <- NULL
  dmlists <- NULL
  mtlists <- NULL

  if(!is.na(dimensions[1])){
    p.dimensions <- paste("dimensions=",
                 paste(sapply(dimensions,function(x){paste("ga:",x,sep="")}),collapse=","),
                        sep="")
  }else{
    p.dimensions <- ""
  }
  if(is.na(metrics[1])){
    print("at least a metric")
    return
  }else{
    p.metrics <- paste("metrics=",
                 paste(sapply(metrics,function(x){paste("ga:",x,sep="")}),collapse=","),
                 sep="")
  }
  ifelse(!is.na(segment), p.segment <- paste("segment=",segment,sep=""), p.segment <- "")
  ifelse(!is.na(filters), p.filters <- paste("filters=",paste("ga:",filters,sep=""),sep=""),p.filters <- "")
  ifelse((!is.na(sort)),  p.sort <- paste("sort=",sort,sep=""), p.sort <- "")
  ifelse(!is.na(start.index),
         p.start.index <- paste("start-index=",start.index,sep=""),
         p.start.index <- "")
  ifelse(!is.na(max.results),
         p.max.results <- paste("max-results=",max.results,sep=""),
         p.max.results <- "")
  p.start.date <- paste("start-date=",start.date,sep="")
  p.end.date <- paste("end-date=",end.date,sep="")
  pms <- c(ids, p.dimensions, p.metrics, p.segment, p.filters, p.sort, p.start.date, p.end.date, p.start.index, p.max.results)
  pm <- paste(pms[pms != ""], collapse="&")  # drop empty parameters so the query string has no stray "&"
  url <- paste(url, pm,sep="")
  print(url)
  p.auth <- paste("GoogleLogin Auth=",auth,sep="")
  h <- basicTextGatherer()
  curlPerform(url=url, .opts=curlOptions(httpheader=c(Authorization=p.auth)), writefunction=h$update)
  body <- h$value()
  doc <- xmlParse(body)
  #print(metrics)
  mtlists <- lapply(metrics, function(d){
    path = paste("//atom:entry/dxp:metric[@name='ga:", d, "']",sep="")
    xpathSApply(doc, path, quote(xmlGetAttr(x,'value')), namespaces=c(atom='http://www.w3.org/2005/Atom',dxp='http://schemas.google.com/analytics/2009'))
  })
  dmlists <- lapply(dimensions, function(d){
    path = paste("//atom:entry/dxp:dimension[@name='ga:", d, "']",sep="")
    xpathSApply(doc, path, quote(xmlGetAttr(x,'value')), namespaces=c(atom='http://www.w3.org/2005/Atom',dxp='http://schemas.google.com/analytics/2009'))
  })
  totalResults <- xmlValue(getNodeSet(doc,
                            "//atom:feed/openSearch:totalResults/text()",
                            namespaces=c(atom='http://www.w3.org/2005/Atom',openSearch="http://a9.com/-/spec/opensearchrss/1.0/",dxp='http://schemas.google.com/analytics/2009'))[[1]])
  itemsPerPage <- xmlValue(getNodeSet(doc, "//atom:feed/openSearch:itemsPerPage/text()",
                            namespaces=c(atom='http://www.w3.org/2005/Atom',openSearch="http://a9.com/-/spec/opensearchrss/1.0/",dxp='http://schemas.google.com/analytics/2009'))[[1]])
  startIndex <- xmlValue(getNodeSet(doc,
                          "//atom:feed/openSearch:startIndex/text()",
                          namespaces=c(atom='http://www.w3.org/2005/Atom',openSearch="http://a9.com/-/spec/opensearchrss/1.0/",dxp='http://schemas.google.com/analytics/2009'))[[1]])
  print(paste("get(", startIndex,
              "-",
              (as.numeric(startIndex) - 1 + as.numeric(itemsPerPage[1])),
              ") in ",
              totalResults[1],
              sep=""))
  #print(paste("start.index","itemsPerPage[1]","totalResults[1]",startIndex,itemsPerPage[1],totalResults[1]))
  if(is.null(dmlists[1][[1]][1][[1]])){ # not entirely sure of the feed structure here; xpath comes back empty when no dimensions were requested
    dt <-  as.data.frame(mtlists,stringsAsFactors=FALSE)
  }else{
    dt <- cbind(as.data.frame(dmlists,stringsAsFactors=FALSE),
              as.data.frame(mtlists,stringsAsFactors=FALSE))
  }
  if(is.na(dimensions[1])){
    colnames(dt) <- metrics
  }else{
    colnames(dt) <- c(dimensions, metrics)
  }
  dt[,metrics] <- sapply(dt[,metrics],as.numeric)
  print(paste(colnames(dt),collapse=","))
  if("date" %in% colnames(dt)) dt$date <- as.Date(dt$date,"%Y%m%d")
  if(startIndex == "1" & totalResults[1]>itemsPerPage[1]){
    print("残りデータを取得")
    bpoint <- as.numeric(itemsPerPage[1]) + 1
    for(i in seq(bpoint, as.numeric(totalResults[1]), by=10000)){
      #print(paste(filters,sort,segment,start.index, start.date, end.date, max.results))
      dt <- rbind(dt,
            getData(auth=auth,
                    id=id,
                    metrics=metrics,
                    dimensions=dimensions,
                    filters=filters,
                    sort=sort,
                    segment=segment,
                    start.index=as.character(i),
                    start.date=start.date,
                    end.date=end.date,
                    max.results=10000))
    }
  }
  dt <- dt[rev(order(dt[,metrics[1]])),]  # sort descending by the first metric
  print(head(dt,5))
  dt
}
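
# The query encoded in myurl above corresponds to a getData call like this
# (a sketch; the profile id is the one from the sample URL):
#   hourly <- getData(auth, id="30547051",
#                     metrics=c("visits"),
#                     dimensions=c("date","hour"),
#                     start.date="2010-09-13", end.date="2010-09-27",
#                     max.results="1000")
#   head(hourly)   # columns: date, hour, visits (sorted by visits, descending)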


# for computing a weighted mean (note: this masks base R's weighted.mean)
weighted.mean <- function(data, elm, denom, prior.mean=NULL, prior.num=NULL, cname=NULL){
  if(is.null(prior.num)){
    prior.num <- max(data[,denom])
  }
  if(is.null(prior.mean)){
    prior.mean <- sum(data[, elm])/sum(data[, denom])
  }
  print(prior.mean*100)
  cname <- ifelse(is.null(cname),"w.rate", cname)
  data <- transform(data,
                    #w.rate.asis=(data[,elm]/data[,denom]),
                    tmp = ifelse(data[,denom]==0,
                                 prior.mean,
                                 (data[,denom]/prior.num)*(data[,elm]/data[,denom]) +
                                 (1 - data[,denom]/prior.num)*prior.mean))
  colnames(data)[colnames(data)=="tmp"] <- cname  # give the new column its intended name
  data
}
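
# Usage sketch: a weighted bounce rate per landing page (metric and dimension
# names come from the reference lists below; the profile id is a placeholder):
#   pages <- getData(auth, id="12345678",
#                    metrics=c("bounces","entrances"),
#                    dimensions=c("landingPagePath"))
#   pages <- weighted.mean(pages, elm="bounces", denom="entrances", cname="w.bounce.rate")
#   head(pages[order(pages$w.bounce.rate, decreasing=TRUE),])   # rank roughly the way weighted sort would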





# The lists below are not needed to run the script; they are a reference for when you forget the field names.

ga.dimensions = list(
  visitor=c("browser","city","country","date","day","daysSinceLastVisit","hostname","month","pageDepth","region","visitCount","visitLength","visitorType","week","year"),
  campaign=c("adContent","adGroup","adSlot","adSlotPosition","campaign","keyword","medium","referralPath","source"),
  content=c("pagePath","exitPagePath","landingPagePath","nextPagePath","pageTitle","previousPagePath"),
  ecommerce=c("affiliation","daysToTransaction","productCategory","productName","productSku","transactionId","visitsToTransaction"),
  internal.search=c("searchCategory","searchDestinationPage","searchKeyword","searchKeywordRefinement","searchUsed"),
  custom.variable=c("customVarName1","customVarValue1"),
  events=c("eventCategory","eventAction","eventLabel")
  )

ga.metrics = list(
  visitor=c("bounces","entrances","exits","newVisits","pageviews","timeOnPage","timeOnSite","visitors","visits"),
  campaign=c("adClicks","adCost","CPC","CPM","CTR","impressions"),
  content=c("uniquePageviews"),
  ecommerce=c("itemRevenue","itemQuantity","transactions","transactionShipping","transactionTax","uniquePurchases"),
  internal.search=c("searchDepth","searchDuration","searchExits","searchRefinements","searchUniques","searchVisits"),
  goals=c("goal1Completions","goalCompletionsAll","goal1Starts","goal1Value","goalValueAll"),
  events=c("totalEvents","uniqueEvents","eventValue")
  )
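# Intended for quick lookup at the console, e.g.:
#   ga.metrics$visitor        # "bounces" "entrances" ... "visits"
#   ga.dimensions$campaign    # "adContent" "adGroup" ... "source"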
      




