#########################################################################################
# Some functions to quantify your Google Scholar citations page.
# R functions Copyright (C) 2011 John Muschelli (jmuschel@jhsph.edu), Andrew Jaffe (ajaffe@jhsph.edu),
# Jeffrey Leek (jtleek@gmail.com), and the Simply Statistics Blog
# (http://simplystatistics.tumblr.com, http://twitter.com/simplystats)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details, see <http://www.gnu.org/licenses/>.
#
# These functions depend on the packages wordcloud, tm, sendmailR, and RColorBrewer.
# The script will attempt to install any that are missing when you source it.
#
# How to use:
#
# Source the script:
#   source("http://biostat.jhsph.edu/~jleek/code/googleCite.r")
#
# Get the url for a scholar (this is the one for Rafa Irizarry:
# http://scholar.google.com/citations?user=nFW-2Q8AAAAJ&hl=en) and run the googleCite
# function. Setting plotIt=TRUE (the default) plots word clouds of the co-authors and
# paper titles; if you also set pdfname="yourname_wordcloud.pdf", the clouds are written
# to that pdf file. When you run this function, your Google Scholar data will be sent to
# our email account, so that we can see who is running the function and to perform
# population-level analyses. The returned object is a table with the data from your
# Google Scholar citation page.
#   out <- googleCite("http://scholar.google.com/citations?user=nFW-2Q8AAAAJ&hl=en", pdfname="rafa_cloud.pdf")
#
# To calculate some popular citation indices, apply gcSummary to the output:
#   gcSummary(out)
#
# You can also search for a specific individual by name using the function searchCite:
#   out2 <- searchCite("Rafa Irizarry", pdfname="rafa_cloud.pdf")
########################################################################################
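# A minimal end-to-end sketch of the workflow described above (the profile url is the
# Rafa Irizarry example from the header; substitute your own):
#
#   source("http://biostat.jhsph.edu/~jleek/code/googleCite.r")
#   out <- googleCite("http://scholar.google.com/citations?user=nFW-2Q8AAAAJ&hl=en",
#                     plotIt = FALSE)   # skip the word clouds
#   gcSummary(out)                      # print total papers, h-, g-, and m-indices
#   out2 <- searchCite("Rafa Irizarry", pdfname = "rafa_cloud.pdf")   # look up a profile by name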

getPckg <- function(pckg) install.packages(pckg, repos = "http://cran.r-project.org")

pckg = try(require(wordcloud))
if(!pckg) {
  cat("Installing 'wordcloud' from CRAN\n")
  getPckg("wordcloud")
  require(wordcloud)
}

pckg = try(require(tm))
if(!pckg) {
  cat("Installing 'tm' from CRAN\n")
  getPckg("tm")
  require("tm")
}

pckg = try(require(sendmailR))
if(!pckg) {
  cat("Installing 'sendmailR' from CRAN\n")
  getPckg("sendmailR")
  require("sendmailR")
}

pckg = try(require(RColorBrewer))
if(!pckg) {
  cat("Installing 'RColorBrewer' from CRAN\n")
  getPckg("RColorBrewer")
  require("RColorBrewer")
}

# helper functions

googleCite = function(theurl, plotIt = TRUE, pdfname = NULL) {
  # strip the "&hl=..." language suffix so the paging parameters can be appended cleanly
  theurl = strsplit(theurl, "&hl")[[1]][1]
  alldata <- NULL
  author = getAuthor(paste(theurl, "&view_op=list_works&pagesize=100&cstart=", 0, sep = ""))
  # walk through the citation list 100 entries at a time until getcites() signals the last page
  for (ipage in 0:1000) {
    checker <- ipage * 100
    page = paste(theurl, "&view_op=list_works&pagesize=100&cstart=", checker, sep = "")
    temper <- getcites(page, checkcite = checker)
    alldata <- rbind(alldata, temper$data)
    if (temper$stopit == 1) break
  }
  # split the author strings to flag papers where this scholar is first, second, or last author
  alldata$"First Author" <- NA
  alldata$"Second Author" <- NA
  alldata$"Last Author" <- NA
  alldata$"N Authors" <- NA
  for (irow in 1:nrow(alldata)) {
    tmp = strsplit(alldata$Author[irow], ",")[[1]]
    alldata$"First Author"[irow] <- tmp[1]
    alldata$"Second Author"[irow] <- tmp[2]
    alldata$"Last Author"[irow] <- tmp[length(tmp)]
    alldata$"N Authors"[irow] <- length(tmp)
  }
  alldata$Is_First <- grepl(alldata$"First Author", pattern = author)
  alldata$Is_Second <- grepl(alldata$"Second Author", pattern = author)
  alldata$Is_Last <- grepl(alldata$"Last Author", pattern = author)
  alldata$"First Author" <- NULL
  alldata$"Second Author" <- NULL
  alldata$"Last Author" <- NULL
  if (plotIt) {
    if (!is.null(pdfname)) pdf(pdfname, height = 6, width = 12)
    par(mfrow = c(1, 2))
    makeAuthorCloud(alldata)
    makePaperCloud(alldata)
    if (!is.null(pdfname)) dev.off()
  }
  # email the scraped table to the authors, as noted in the header; failures are silently ignored
  from <- sprintf("", Sys.info()[4])
  to <- ""
  subject <- author
  body <- list(theurl, mime_part(alldata))
  tmpEmail = try(email <- sendmail(from, to, subject, body,
                                   control = list(smtpServer = "ASPMX.L.GOOGLE.COM")), silent = T)
  return(alldata)
}

getAuthor <- function(webpage) {
  # read the profile page and pull the scholar's name out of the page header
  options(warn = -1)
  con <- url(webpage)
  x <- readLines(con, encoding = "UTF-8")
  y <- strsplit(x, split = "<")
  z <- y[[1]]
  tmp = z[6]
  tmp2 = strsplit(tmp, " ")[[1]]
  ind = grep("-", tmp2)
  out = tmp2[ind - 1]
  close(con)
  return(out)
}
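# A small illustrative sketch of how googleCite() drives the two helpers getAuthor()
# (above) and getcites() (below): the scholar's name comes from the first results page,
# and each call to getcites() parses one 100-entry page selected by the cstart offset.
# The url reuses the example profile from the header; the ex_* names are local to this
# example only.
#
#   ex_base <- "http://scholar.google.com/citations?user=nFW-2Q8AAAAJ"
#   ex_page <- paste(ex_base, "&view_op=list_works&pagesize=100&cstart=", 0, sep = "")
#   getAuthor(ex_page)                # the scholar's name, used to flag authorship position
#   getcites(ex_page, checkcite = 0)  # list(data = <up to 100-row table>, stopit = 0 or 1)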
getcites <- function(page, checkcite) {
  # parse in the C locale so the regexes behave predictably; restore the user's locale on exit
  old.locale <- Sys.getlocale()
  Sys.setlocale(locale = "C")
  on.exit(Sys.setlocale(locale = old.locale))
  options(warn = -1)
  con <- url(page)
  x <- readLines(con)
  close(con)
  x <- strsplit(x, split = "<")
  x <- x[[1]]
  ### grab the end of the citation range shown on the page (e.g. the "100" in "1-100")
  endcites <- x[grep(pattern = "margin: 0 0.5em 0 0.5em;\">", x = x)[1]]
  endcites <- strsplit(endcites, split = "margin: 0 0.5em 0 0.5em;\">")[[1]][2]
  endcites <- as.numeric(strsplit(endcites, split = "-")[[1]][2])
  stopit <- 0
  # print(checkcite)
  # print(endcites)
  if (is.na(endcites)) return(list(data = NULL, stopit = 1))
  if (endcites < checkcite) stopit <- 1
  keepers <- grep(pattern = "cit-table", x)
  keepers <- keepers[-1]
  keepers <- keepers[-1]
  keepers <- c(keepers, length(x))
  x <- x[keepers[2]:keepers[length(keepers)]]
  cites <- grep(x, pattern = "cit-table item")
  cites <- unique(c(cites, length(x)))
  cit <- vector(mode = "list", length = length(cites) - 1)
  ncites <- length(cites) - 1
  data <- NULL
  # each block between consecutive "cit-table item" markers is one paper
  for (icite in 1:(length(cites) - 1)) {
    temp_data <- data.frame(matrix(nrow = 1, ncol = 5))
    temp <- x[cites[icite]:cites[icite + 1]]
    tites <- grep(pattern = "cit-dark-large-link", temp)
    if (length(tites) > 0) temp_data[1, 1] <- strsplit(temp[tites], split = "cit-dark-large-link\">")[[1]][2]
    tites <- grep(pattern = "cit-gray", temp)
    temp2 <- strsplit(temp[tites], split = "\"cit-gray\">")
    if (length(tites) > 0) temp_data[1, 2] <- temp2[[1]][2]
    if (length(temp2) > 1) temp_data[1, 3] <- temp2[[2]][2]
    tites <- grep(pattern = "col-year", temp)
    if (length(tites) > 0) temp_data[1, 4] <- strsplit(temp[tites], split = "col-year\">")[[1]][2]
    tites <- grep(pattern = "col-citedby", temp) + 1
    if (length(tites) > 0) temp_data[1, 5] <- strsplit(temp[tites], split = "\">")[[1]][2]
    data <- rbind(data, temp_data)
  }
  colnames(data) <- c("Paper", "Author", "Journal", "Year", "Citations")
  # replace a few characters/entities that Scholar encodes oddly
  data[, "Paper"] <- gsub(x = data[, "Paper"], pattern = "\227", replacement = "--", fixed = TRUE)
  data[, "Paper"] <- gsub(x = data[, "Paper"], pattern = "‐", replacement = "-", fixed = TRUE)
  data[, "Paper"] <- gsub(x = data[, "Paper"], pattern = "&#39;", replacement = "'", fixed = TRUE)
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "\227", replacement = "--", fixed = TRUE)
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "‐", replacement = "-", fixed = TRUE)
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "&#39;", replacement = "'", fixed = TRUE)
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "\305", replacement = "A", fixed = TRUE)
  return(list(data = data, stopit = stopit))
}

makeAuthorCloud = function(tab) {
  colIndex = which(names(tab) == "Author")
  tmp = strsplit(as.character(tab[, colIndex]), ", ")
  # keep each co-author's surname (Scholar lists authors as "F Lastname")
  out = sapply(tmp, function(x) {
    x = strsplit(x, " ")
    x = sapply(x, function(x) x[2])
    x = tolower(x)
    return(x)
  })
  out = unlist(out)
  tmp2 = table(out)
  tmp2 = tmp2[!(names(tmp2) == "...")]
  d = data.frame(word = names(tmp2), freq = tmp2, row.names = NULL)
  d = d[order(d$freq, decreasing = TRUE), ]
  # drop the most frequent name, which is the scholar themselves
  d = d[-1, ]
  pal = brewer.pal(9, "BuGn")
  pal <- pal[-(1:4)]
  wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words = Inf,
            random.order = FALSE, colors = pal, vfont = c("sans serif", "plain"))
}

makePaperCloud = function(tab) {
  colIndex = which(names(tab) == "Paper")
  # build a term-document matrix from the paper titles, then plot word frequencies
  corpus <- Corpus(DataframeSource(data.frame(tab[, colIndex])))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, function(x) removeWords(x, stopwords("english")))
  tdm <- TermDocumentMatrix(corpus)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  pal = brewer.pal(9, "RdPu")
  pal <- pal[-(1:4)]
  wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words = Inf,
            random.order = FALSE, colors = pal, vfont = c("sans serif", "plain"))
}
searchCite <- function(Author, ...) {
  # turn "First Last" into "First+Last" and query the Scholar author search
  auth.names <- strsplit(Author, " ")[[1]]
  auth.names <- paste(auth.names, sep = "", collapse = "+")
  search.page <- paste("http://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=", auth.names, sep = "")
  thepage <- url(search.page)
  x <- readLines(thepage)
  close(thepage)
  x <- strsplit(x[[1]], split = "user=")[[1]]
  if (length(x) > 1) {
    ### if the search returned at least one hit, grab the first one and extract its user id
    x <- x[2]
    x <- strsplit(x, split = "&")[[1]][1]
    theurl <- paste("http://scholar.google.com/citations?hl=en&user=", x, sep = "")
    print(theurl)
    return(googleCite(theurl, ...))
  } else stop("No Author found")
}

gcSummary <- function(alldata) {
  citations = as.numeric(alldata$Citations)
  citations[is.na(citations)] = 0
  nauthors = as.numeric(alldata$"N Authors")
  n = dim(alldata)[1]
  nF = sum(alldata$Is_First)
  nL = sum(alldata$Is_Last)
  nFL = sum(alldata$Is_Last | alldata$Is_First)
  nFS = sum(alldata$Is_First | alldata$Is_Second)
  totalPapers = dim(alldata)[1]
  totalCites = sum(citations, na.rm = T)
  medianCites = median(citations, na.rm = T)
  medianAuthorCites = median(citations / nauthors, na.rm = T)
  # the index calculations below assume the rows are ordered by decreasing citation count
  # (Scholar's default listing order, which googleCite() preserves)
  hindex = sum(citations > 1:n, na.rm = T)
  hindexF = sum(citations[alldata$Is_First] > 1:nF, na.rm = T)
  hindexL = sum(citations[alldata$Is_Last] > 1:nL, na.rm = T)
  hindexFL = sum(citations[alldata$Is_Last | alldata$Is_First] > 1:nFL, na.rm = T)
  hindexFS = sum(citations[alldata$Is_First | alldata$Is_Second] > 1:nFS, na.rm = T)
  tmp = cumsum(citations)
  gindex = sum(tmp >= (1:n)^2)
  nyears = as.numeric(format(Sys.time(), "%Y")) - min(as.numeric(alldata$Year), na.rm = T)
  mindex = hindex / nyears
  cat("Total papers = ", totalPapers, "\n", sep = "")
  cat("Median citations per paper = ", medianCites, "\n", sep = "")
  cat("Median (citations / # of authors) per paper = ", medianAuthorCites, "\n", sep = "")
  cat("H-index = ", hindex, "\n", sep = "")
  cat("G-index = ", gindex, "\n", sep = "")
  cat("M-index = ", mindex, "\n", sep = "")
  cat("First author H-index = ", hindexF, "\n", sep = "")
  cat("Last author H-index = ", hindexL, "\n", sep = "")
  cat("First or last author H-index = ", hindexFL, "\n", sep = "")
  cat("First or second author H-index = ", hindexFS, "\n", sep = "")
}
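# A self-contained toy illustration (made-up citation counts, not Scholar data) of the
# index definitions used in gcSummary(): the h-index is the largest h such that h papers
# have at least h citations each, and the g-index is the largest g such that the g
# most-cited papers together have at least g^2 citations. Note that gcSummary() uses a
# strict ">" for the h-index, so its value can be at most one lower than the ">=" form.
#
#   toy <- sort(c(25, 12, 8, 6, 3, 1, 0), decreasing = TRUE)
#   sum(toy >= seq_along(toy))             # h-index = 4 (four papers with >= 4 citations)
#   sum(cumsum(toy) >= seq_along(toy)^2)   # g-index = 7 (55 total citations >= 7^2 = 49)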