############################################################################ ############################################################################ ## ## ## File answers.txt: some solutions to the R-warmup session exercises ## ## ## ############################################################################ ############################################################################ ### SECTION I don't know R ### ====================== ## exercise 1 ## ---------- data(iris) mean(iris[,4]) # but mean(iris$Petal.width) and mean(iris[,"Petal.width"]) are better # summary() does not compute relative dispersion aka coefficient of variation or relative inter quartile range. # summary() does not plot anything. median(iris[,4]) # but median(iris$Petal.width) and median(iris[,"Petal.width"]) are better # R computes but does not give any advice. You have to know things to ask R to do it. # R does not show the unit. You must run help(iris) to learn that the the measurements are in centimeters ## exercise 2 ## ---------- source("http://www.info.univ-angers.fr/~gh/statgh.r",encoding="latin1") elf <- lit.dar("http://www.info.univ-angers.fr/~gh/Datasets/elf.dar") # another solution elf <- read.table("http://www.info.univ-angers.fr/~gh/Datasets/elf.dar",head=TRUE,row.names=1) # factor conversion and frequency table computations elf$SEXE <- factor(elf$SEXE,levels=0:1,labels=c("male","female")) cnts <- table(elf$SEXE) resTab <- data.frame(rbind(cnts, 100*prop.table(cnts ) )) row.names(resTab)[2] <- "pcts (%)" print(resTab) # sort the counts with sort(cnts,decreasing = TRUE) ## exercise 3 ## ---------- library(ape) geneX <- read.GenBank("X94991.1",as.character=TRUE) cnt <- table(geneX) cat( round(100*(cnt[2]+cnt[3])/sum(cnt)),"%",sep="" ) # GC.content(geneX) is not OK since geneX is not an object of class "DNAbin", # but it works after conversion: class(geneX) geneX.dnab <- as.DNAbin( geneX ) GC.content( as.DNAbin(geneX) ) ## exercise 4 ## ---------- t.test(AGE ~ SEXE,data=elf) # there is no significant difference between women and men as far as age is concerned. # no, R does not check any thing (let's says normality or sample sizes) before computing a test. it is your responsability to do so (and to know what to do !). ## exercise 5 ## ---------- # msaR is a package that does multiple sequences alignments # and also an htmlwidgets wrapper of the BioJS MSA viewer javascript library. # see https://zachcp.github.io/msaR/ browseURL("https://zachcp.github.io/msaR/") # AlignStats is also a package that does multiple sequences alignments plus a Web site to use it. # see https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-1300-6 browseURL("https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-1300-6") ## exercise 6 ## ---------- # boxplots do not tell if the distribution is unimodal. beanplots (and violin plots) do. # see http://www.info.univ-angers.fr/~gh/wstat/Introduction_R/intror3.php?solutions=1#tdm6 browseURL("http://www.info.univ-angers.fr/~gh/wstat/Introduction_R/intror3.php?solutions=1#tdm6") ## exercise 7 ## ---------- # the rms package depends on other packages. # https://cran.r-project.org/web/packages/rms/rms.pdf # so before installing the rms package, R installs the required packages. this it why it takes so long. # this will show you the rms vignette (pdf file) in your Internet browser: browseURL("https://cran.r-project.org/web/packages/rms/rms.pdf") # install the rms package in Rstudio is the easiest way to do so # because it takes care of the url of the archive to download from, it check the dependencies and so on. library("rms") data(package="rms") # there is no datasets in this package, only functions # associated springer book : # https://www.springer.com/gb/book/9781441929181 # see also http://biostat.mc.vanderbilt.edu/tmp/course.pdf -- free copy of the book browseURL("https://www.springer.com/gb/book/9781441929181") browseURL("http://biostat.mc.vanderbilt.edu/tmp/course.pdf") ## exercise 8 ## ---------- # install the faraway package in Rstudio is the easiest way to do so library("faraway") data(package="faraway") # associated crc book : # https://www.crcpress.com/Linear-Models-with-R-Second-Edition/Faraway/p/book/9781439887332 # https://www.crcpress.com/Extending-the-Linear-Model-with-R-Generalized-Linear-Mixed-Effects-and/Faraway/p/book/9781498720960 data(diabetes) diabetes2 <- na.omit(diabetes) # this is not a good idea because it will remove tow many lines: # diabetes has 403 lines and diabetes2 only 130 # it is better to select the variables and then to remove the NA values for the selected variables. dim(diabetes) # but nrow(diabetes) is enough dim(diabetes2) ## exercise 9 ## ---------- # install the survival package in Rstudio is the easiest way to do so library("survival") data(package="survival") class(leukemia) dim(leukemia) class(kidney) dim(kidney) # for all data, use my ddata() function with two "d" at the beginning of the name of the function # http://forge.info.univ-angers.fr/~gh/wstat/statghfns.php?lafns=ddata # not run : # source("http://forge.info.univ-angers.fr/~gh/wstat/statgh.r",encoding="latin1") # ddata("survival") ## exercise 10 ## ---------- # usually R has already functions to deal with lines and columns, such as colMeans(), apply(), tapply()... # if you write your own functions, yoy may not be as efficient as R is, and it may take a lot of time and memory. ### SECTION I think I know R for statistics and bioinformatics ### ========================================================== # load all the needed datasets data(cars) data(iris) diab <- read.table("http://forge.info.univ-angers.fr/~gh/wstat/Eda/diabetes.dar",head=TRUE,row.names=1) # exercise 1 ## ---------- print( colMeans( iris[,(1:4)] )) apply(FUN=summary,X=iris[,-5],MARGIN=2) tapply(FUN=summary,X=iris[,1],INDEX=iris$Species) ## exercise 2 ## ---------- row.names(cars) <- paste("Car",sprintf("%03d",1:nrow(cars)),sep="") print( head(cars,n=10) ) ## exercise 3 ## ---------- print( cbind(names(iris)), quote=FALSE) ## exercise 4 ## ---------- flt <- is.na(diab$bp.1s) diab2 <- diab[ !flt, ] nrow(diab) nrow(diab2) naCols <- data.frame(cols=names(diab2),nbNA=apply(X=diab2,M=2,FUN = function(x) { return(sum(is.na(x))) } )) print(naCols[ order(naCols$nbNA,decreasing=TRUE),]) # so bp.2s and bp.2s have both 257 NA values ## exercise 5 ## ---------- diab <- transform(diab,ageCL=ifelse(age<18,"young","old") ) # mutate() can transform "on the fly" variables. tranform() can not. ## exercise 6 ## ---------- values <- rbinom(n=10**6,size = 5,prob=0.3) maxVal <- max(values) occurs <- which(values==maxVal) cat("max is",maxVal," it occurs",length(occurs),"times. first positions: ",head(occurs),"\n") # the bad way: cat("max is",max(values)," it occurs",length(which(values==max(values))),"times. first positions: ",head(which(values==max(values))),"\n") # you may use system.time( expression ) to evaluate the duration # but the microbenchmark() function from the eponym package is a better solution # first expression is roughly 13 times faster than the second one ## exercise 7 ## ---------- # see our decritQLf() function at http://forge.info.univ-angers.fr/~gh/wstat/statghfns.php?lafns=decritQLf ## exercise 8 ## ---------- # see http://forge.info.univ-angers.fr/~gh/wstat/Introduction_R/intror3.php?solutions=1#tdm2 # in short, use our decritQT() function at http://forge.info.univ-angers.fr/~gh/wstat/statghfns.php?lafns=decritQT # basically, this is what you do: source("http://www.info.univ-angers.fr/~gh/statgh.r",encoding="latin1") lea <- lit.dar("http://www.info.univ-angers.fr/~gh/Datasets/lea.dar") lng <- lea$length hist(lng,col="lightblue",main="",probability=TRUE,ylim=c(0,0.006),xlab="",ylab="") # histogram (adapted on Y-axis) vnorm <- function(x) { return( dnorm(x,mean=mean(lng),sd=sd(lng)) ) } # normal candidate curve(vnorm,add=TRUE,col="red",lwd=2) # curve of the normal candidate dns <- density(lng) # kernel density estimation lines(dns,col="blue") # plot of this density rug(lng) # "peigne" of density on X-axis legend(x="topright",c("density","loi normale"),col=c("blue","red"),lty=c(1,1),bty="y") # legend is usefull here titre <- paste("Histogramme des",length(lng),"longueurs") # in French : le titre et les labels des axes title(main=titre,xlab="length of sequences",ylab="density") # that's all, folks! ## exercise 9 ## ---------- # this is a debate too long to write it here; use internet to find some answers: browseURL("https://www.google.com/search?q=ggplot+versus+lattice") ## exercise 10 ## ---------- # there are some good tutorials and even good videos for shiny and r jupyter notebooks. # it is easy to find them on internet, for instance: browseURL("https://www.datacamp.com/community/blog/jupyter-notebook-r") browseURL("https://shiny.rstudio.com/tutorial/") ### SECTION I think I know how to program in R ### ========================================== ## exercise 1 ## ---------- apply(X=iris[,-5],MARGIN=2,FUN=function(x) { tapply(FUN=summary,X=x,INDEX=iris$Species) } ) ## exercise 2 ## ---------- # first define a copies function for strings using rep() .copies <- function(string,nrep=0) { # renvoie n copies de la chaine string res <- "" if (nrep>0) { res <- paste(rep(string,nrep),collapse="") } return(res) } # fin de fonction .copies # then use it cats <- function(chen="",soulign="=") { cat("\n") cat(chen,"\n") cat(.copies(soulign,nchar(chen)),"\n") cat("\n") } # fin de fonction cats cats("first part") cats("second part","-") ## exercise 3 ## ---------- timer <- function(...) { # on utilise le package lubridate car plus "sympa" que Sys.time() library(lubridate) tempsDeb <- now() eval.parent(...) tempsFin <- now() cat("duration between ",format(tempsDeb,"%c")," and ",format(tempsFin,"%c")," = ",tempsFin-tempsDeb,"\n") return(tempsFin-tempsDeb) } # fin de fonction timer ## exercise 4 ## ---------- extractPvalue <- function( a.t.test ) { return(a.t.test$p.value) } extractPvalues <- function( a.data.frame, a.column.name ) { colNum <- which(names(a.data.frame)==a.column.name) apply(X =a.data.frame[,-colNum],M=2,F=function(x) { return( extractPvalue(t.test(x~a.data.frame[,a.column.name]) ) ) }) } # end of function extractPvalues iris2 <- iris[ (!iris$Species=="setosa"), ] iris2$Species2 <- factor(iris2$Species) iris2$Species <- NULL # this removes the Species column extractPvalue( t.test(iris2$Sepal.Length ~ iris2$Species2) ) cbind(extractPvalues( iris2, "Species2" )) ## exercise 5 ## ---------- # following example(boxplot) : boxplot2 <- function( myData, myConditions, varColors,...) { boxplot(len ~ dose, data = myData, boxwex = 0.25, at = 1:3 - 0.2, subset = supp == "VC", col = varColors[1], xlim = c(0.5, 3.5), ylim = c(0, 35), yaxs = "i",...) boxplot(len ~ dose, data=ToothGrowth, add=TRUE,boxwex = 0.25, at = 1:3 + 0.2, subset = supp == "OJ", col = varColors[2]) legend(2, 9, myConditions,fill = varColors) } # fin de fonction boxplot2 boxplot3( myData=ToothGrowth, myConditions=c("Ascorbic acid", "Orange juice"), varColors = c("yellow", "orange"), main = "Guinea Pigs' Tooth Growth", xlab = "Vitamin C dose mg", ylab = "tooth length" ) # fin de boxplot3 ## exercise 6 ## ---------- # paramètres de la loi binomiale n <- 4 # 4 enfants par famille p <- 0.5 # hypothèse d'équiprobabilité H/F # fréquences théoriques fth <- dbinom( 0:n, n, p) # effectifs observés eobs <- c(18,55,21,12,4) # effectifs théoriques eth <- fth*sum( eobs ) # différences dif <- eobs-eth # carré des différences cdif <- dif*dif # carré pondéré des différences (=contribution) cpdif <- cdif/eth # cumul des carrés pondérés des différences scpd <- cumsum( cpdif) # test du Chi-deux testc <- chisq.test( eobs, p=fth) # restructuration et affichage des données resTab <- rbind( fth, eth, eobs,dif,cdif,cpdif,scpd ) row.names(resTab) <- c("Fréquences théoriques", "Effectifs théoriques", "Effectifs observés", # R allows you to write comments inside instructions "Différences", # nice, isn't it? "Carrés des diff.", "Contributions", # the most important data is this column "Somme des ctr.") colnames(resTab) <- 0:n cat("Results\n") print(resTab) # résultats du test print( testc ) # le graphique en barres s'obtient par barplot(rbind(eobs,eth),beside=TRUE) ## exercise 7 ## ---------- # if you are not able to program the recursive solution, follow the # one from the sets package library(sets) allSets <- function(n) { nbSets <- 0 nbAll <- 2**n cat( sprintf("%3d",nbSets),"/",nbAll,"empty set\n") for (ind in (1:n)) { allEns <- set_combn(letters[1:n],ind) for (s in allEns) { nbSets <- nbSets + 1 cat( sprintf("%3d",nbSets),"/",nbAll,as.character(s),"\n") } ; # fin pour s } ; # fin pour ind } # end of function allSets allSets(3) ## exercise 8 ## ---------- .library <- function() { options(warn=-1) options(internet.info=100) suppressMessages(library(gdata,quietly=TRUE,verbose=FALSE,warn.conflicts=FALSE)) } # fin de fonction .library .library() # with a dot at the beginning R does not show the function in R studio's environment pane ## exercise 9 ## ---------- # more information here: https://en.wikipedia.org/wiki/Longest_common_subsequence_problem # read my book at Dunod to know the answer for R. the method is called suffix tree. # these instructions are NOT the solution library(Biostrings) library(Rlibstree) library(stringi) fn <- "putida_gb_1.fasta" dnaSeq <- readDNAStringSet(fn) seq <- toString(dnaSeq[1]) lrsStr <- getLongestRepeatedSubstring(seq) nbc <- nchar(lrsStr) lrsPos <- stri_locate_all(seq,fixed=lrsStr) cat("the length of the longest repeated substring of ",fn," is: ",nbc,"\n") cat("positions of this substring are:\n") print(lrsPos) ## exercise 10 ## ---------- # there is still a controverse about the three systems for defining objects in R. have a look on the internet to know what it is about. browseURL("https://stackoverflow.com/questions/27219132/creating-classes-in-r-s3-s4-r5-rc-or-r6") # tidyverse is a "modern" way of using R, promoted by Hadley WICKAM and others. have a look at it and decide if it is worth the effort. sbrowseURL("https://www.tidyverse.org/learn/")