############################################################################
############################################################################
##                                                                        ##
##   File answers.txt: some solutions to the R-warmup session exercises   ##
##                                                                        ##
############################################################################
############################################################################


### SECTION I don't know R
### ======================


## exercise 1
## ----------

data(iris)
mean(iris[,4]) # but mean(iris$Petal.width) and mean(iris[,"Petal.width"]) are better

# summary() does not compute relative dispersion aka coefficient of variation or relative inter quartile range.
# summary() does not plot anything.

median(iris[,4]) # but median(iris$Petal.width) and median(iris[,"Petal.width"]) are better

# R computes but does not give any advice. You have to know things to ask R to do it.
# R does not show the unit. You must run help(iris) to learn that the  the measurements are in centimeters

## exercise 2
## ----------

source("http://www.info.univ-angers.fr/~gh/statgh.r",encoding="latin1")
elf      <- lit.dar("http://www.info.univ-angers.fr/~gh/Datasets/elf.dar")

# another solution

elf <- read.table("http://www.info.univ-angers.fr/~gh/Datasets/elf.dar",head=TRUE,row.names=1)

# factor conversion and frequency table computations

elf$SEXE <- factor(elf$SEXE,levels=0:1,labels=c("male","female"))
cnts     <- table(elf$SEXE)
resTab   <- data.frame(rbind(cnts, 100*prop.table(cnts ) ))
row.names(resTab)[2] <- "pcts (%)"
print(resTab)

# sort the counts with

sort(cnts,decreasing = TRUE)

## exercise 3
## ----------

library(ape)
geneX <- read.GenBank("X94991.1",as.character=TRUE)
cnt <- table(geneX)
cat( round(100*(cnt[2]+cnt[3])/sum(cnt)),"%",sep="" )

# GC.content(geneX) is not OK since geneX is not an object of class "DNAbin",
# but it works after conversion:

class(geneX)
geneX.dnab <- as.DNAbin( geneX )
GC.content( as.DNAbin(geneX) )

## exercise 4
## ----------

t.test(AGE ~ SEXE,data=elf)

# there is no significant difference between women and men as far as age is concerned.

# no, R does not check any thing (let's says normality or sample sizes) before computing a test. it is your responsability to do so (and to know what to do !).

## exercise 5
## ----------

# msaR is a package that does multiple sequences alignments
# and also an htmlwidgets wrapper of the BioJS MSA viewer javascript library.
# see https://zachcp.github.io/msaR/

browseURL("https://zachcp.github.io/msaR/")

# AlignStats is also a package that does multiple sequences alignments plus a Web site to use it.
# see https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-1300-6

browseURL("https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-1300-6")

## exercise 6
## ----------

# boxplots do not tell if the distribution is unimodal. beanplots (and violin plots) do.
# see http://www.info.univ-angers.fr/~gh/wstat/Introduction_R/intror3.php?solutions=1#tdm6

browseURL("http://www.info.univ-angers.fr/~gh/wstat/Introduction_R/intror3.php?solutions=1#tdm6")

## exercise 7
## ----------

# the rms package depends on other packages.
# https://cran.r-project.org/web/packages/rms/rms.pdf
# so before installing the rms package, R installs the required packages. this it why it takes so long.
# this will show you the rms vignette (pdf file) in your Internet browser:

browseURL("https://cran.r-project.org/web/packages/rms/rms.pdf")

# install the rms package in Rstudio is the easiest way to do so
# because it takes care of the url of the archive to download from, it check the dependencies and so on.

library("rms")
data(package="rms")

# there is no datasets in this package, only functions

# associated springer book :
# https://www.springer.com/gb/book/9781441929181
# see also http://biostat.mc.vanderbilt.edu/tmp/course.pdf -- free copy of the book

browseURL("https://www.springer.com/gb/book/9781441929181")
browseURL("http://biostat.mc.vanderbilt.edu/tmp/course.pdf")

## exercise 8
## ----------

# install the faraway package in Rstudio is the easiest way to do so

library("faraway")
data(package="faraway")

# associated crc book :
# https://www.crcpress.com/Linear-Models-with-R-Second-Edition/Faraway/p/book/9781439887332
# https://www.crcpress.com/Extending-the-Linear-Model-with-R-Generalized-Linear-Mixed-Effects-and/Faraway/p/book/9781498720960

data(diabetes)
diabetes2 <- na.omit(diabetes)

# this is not a good idea because it will remove tow many lines:
# diabetes has 403 lines and diabetes2 only 130
# it is better to select the variables and then to remove the NA values for the selected variables.

dim(diabetes) # but nrow(diabetes) is enough
dim(diabetes2)

## exercise 9
## ----------

# install the survival package in Rstudio is the easiest way to do so

library("survival")
data(package="survival")

class(leukemia)
dim(leukemia)

class(kidney)
dim(kidney)

# for all data, use my ddata() function with two "d" at the beginning of the name of the function
# http://forge.info.univ-angers.fr/~gh/wstat/statghfns.php?lafns=ddata

# not run :
#  source("http://forge.info.univ-angers.fr/~gh/wstat/statgh.r",encoding="latin1")
#  ddata("survival")

## exercise 10
## ----------

# usually R has already functions to deal with lines and columns, such as colMeans(), apply(), tapply()...
# if you write your own functions, yoy may not be as efficient as R is, and it may take a lot of time and memory.

### SECTION I think I know R for statistics and bioinformatics
### ==========================================================

# load all the needed datasets

data(cars)
data(iris)
diab <- read.table("http://forge.info.univ-angers.fr/~gh/wstat/Eda/diabetes.dar",head=TRUE,row.names=1)

# exercise 1
## ----------

print( colMeans( iris[,(1:4)] ))
apply(FUN=summary,X=iris[,-5],MARGIN=2)
tapply(FUN=summary,X=iris[,1],INDEX=iris$Species)

## exercise 2
## ----------

row.names(cars) <- paste("Car",sprintf("%03d",1:nrow(cars)),sep="")
print( head(cars,n=10) )

## exercise 3
## ----------

print( cbind(names(iris)), quote=FALSE)

## exercise 4
## ----------

flt   <- is.na(diab$bp.1s)
diab2 <- diab[ !flt, ]
nrow(diab)
nrow(diab2)

naCols <- data.frame(cols=names(diab2),nbNA=apply(X=diab2,M=2,FUN = function(x) { return(sum(is.na(x))) } ))
print(naCols[ order(naCols$nbNA,decreasing=TRUE),])

# so bp.2s and bp.2s have both 257 NA values

## exercise 5
## ----------

diab <- transform(diab,ageCL=ifelse(age<18,"young","old") )

# mutate() can transform "on the fly" variables. tranform() can not.

## exercise 6
## ----------

values  <- rbinom(n=10**6,size = 5,prob=0.3)
maxVal  <- max(values)
occurs  <- which(values==maxVal)
cat("max is",maxVal," it occurs",length(occurs),"times. first positions: ",head(occurs),"\n")

# the bad way:
cat("max is",max(values)," it occurs",length(which(values==max(values))),"times. first positions: ",head(which(values==max(values))),"\n")

# you may use system.time( expression ) to evaluate the duration
# but the microbenchmark() function from the eponym package is a better solution
# first expression is roughly 13 times faster than the second one

## exercise 7
## ----------

# see our decritQLf() function at http://forge.info.univ-angers.fr/~gh/wstat/statghfns.php?lafns=decritQLf

## exercise 8
## ----------

# see http://forge.info.univ-angers.fr/~gh/wstat/Introduction_R/intror3.php?solutions=1#tdm2
# in short, use our decritQT() function at http://forge.info.univ-angers.fr/~gh/wstat/statghfns.php?lafns=decritQT
# basically, this is what you do:

source("http://www.info.univ-angers.fr/~gh/statgh.r",encoding="latin1")
lea <- lit.dar("http://www.info.univ-angers.fr/~gh/Datasets/lea.dar")
lng <- lea$length

hist(lng,col="lightblue",main="",probability=TRUE,ylim=c(0,0.006),xlab="",ylab="")      # histogram (adapted on Y-axis)
vnorm <- function(x) { return( dnorm(x,mean=mean(lng),sd=sd(lng)) ) }                   # normal candidate
curve(vnorm,add=TRUE,col="red",lwd=2)                                                   # curve of the normal candidate
dns <- density(lng)                                                                     # kernel density estimation
lines(dns,col="blue")                                                                   # plot of this density
rug(lng)                                                                                # "peigne" of density on X-axis
legend(x="topright",c("density","loi normale"),col=c("blue","red"),lty=c(1,1),bty="y")  # legend is usefull here
titre <- paste("Histogramme des",length(lng),"longueurs")                               # in French : le titre et les labels des axes
title(main=titre,xlab="length of sequences",ylab="density")                             # that's all, folks!

## exercise 9
## ----------

# this is a debate too long to write it here; use internet to find some answers:

browseURL("https://www.google.com/search?q=ggplot+versus+lattice")

## exercise 10
## ----------

# there are some good tutorials and even good videos for shiny and r jupyter notebooks.
# it is easy to find them on internet, for instance:

browseURL("https://www.datacamp.com/community/blog/jupyter-notebook-r")
browseURL("https://shiny.rstudio.com/tutorial/")

### SECTION I think I know how to program in R
### ==========================================


## exercise 1
## ----------

apply(X=iris[,-5],MARGIN=2,FUN=function(x) { tapply(FUN=summary,X=x,INDEX=iris$Species) } )


## exercise 2
## ----------

# first define a copies function for strings using rep()

.copies <- function(string,nrep=0) { # renvoie n copies de la chaine string
  res <- ""
  if (nrep>0) { res <- paste(rep(string,nrep),collapse="") }
  return(res)
} # fin de fonction .copies

# then use it

cats <- function(chen="",soulign="=") {
  cat("\n")
  cat(chen,"\n")
  cat(.copies(soulign,nchar(chen)),"\n")
  cat("\n")
} # fin de fonction cats

cats("first part")
cats("second part","-")

## exercise 3
## ----------

timer <- function(...) { # on utilise le package lubridate car plus "sympa" que Sys.time()
  library(lubridate)
  tempsDeb <- now()
  eval.parent(...)
  tempsFin <- now()
  cat("duration between ",format(tempsDeb,"%c")," and ",format(tempsFin,"%c")," = ",tempsFin-tempsDeb,"\n")
  return(tempsFin-tempsDeb)
} # fin de fonction timer


## exercise 4
## ----------

extractPvalue <- function( a.t.test ) { return(a.t.test$p.value) }

extractPvalues <- function( a.data.frame, a.column.name ) {
  colNum <- which(names(a.data.frame)==a.column.name)
  apply(X =a.data.frame[,-colNum],M=2,F=function(x) { return( extractPvalue(t.test(x~a.data.frame[,a.column.name]) ) ) })
} # end of function extractPvalues

iris2          <- iris[ (!iris$Species=="setosa"), ]
iris2$Species2 <- factor(iris2$Species)
iris2$Species  <- NULL # this removes the Species column

extractPvalue( t.test(iris2$Sepal.Length ~ iris2$Species2) )
cbind(extractPvalues( iris2, "Species2" ))


## exercise 5
## ----------

# following example(boxplot) :

boxplot2 <- function( myData, myConditions, varColors,...) {

  boxplot(len ~ dose, data = myData,
          boxwex = 0.25, at = 1:3 - 0.2,
          subset = supp == "VC", col = varColors[1],
          xlim = c(0.5, 3.5), ylim = c(0, 35), yaxs = "i",...)

  boxplot(len ~ dose, data=ToothGrowth, add=TRUE,boxwex = 0.25, at = 1:3 + 0.2, subset = supp == "OJ", col = varColors[2])

  legend(2, 9, myConditions,fill = varColors)

} # fin de fonction boxplot2

boxplot3(
  myData=ToothGrowth,
  myConditions=c("Ascorbic acid", "Orange juice"),
  varColors = c("yellow", "orange"),
  main = "Guinea Pigs' Tooth Growth",
  xlab = "Vitamin C dose mg",
  ylab = "tooth length"
) # fin de boxplot3


## exercise 6
## ----------

# paramètres de la loi binomiale

  n <- 4    # 4 enfants par famille
  p <- 0.5  # hypothèse d'équiprobabilité H/F

# fréquences théoriques

  fth  <- dbinom( 0:n, n, p)

# effectifs observés

  eobs <- c(18,55,21,12,4)

# effectifs théoriques

  eth  <- fth*sum( eobs )

# différences

  dif <- eobs-eth

# carré des différences

  cdif <- dif*dif

# carré pondéré des différences (=contribution)

  cpdif <- cdif/eth

# cumul des carrés pondérés des différences

  scpd <- cumsum( cpdif)

# test du Chi-deux

  testc <- chisq.test( eobs, p=fth)

# restructuration et affichage des données

resTab            <- rbind( fth, eth, eobs,dif,cdif,cpdif,scpd )
row.names(resTab) <- c("Fréquences théoriques",
                     "Effectifs théoriques",
                     "Effectifs observés",     # R allows you to write comments inside instructions
                     "Différences",            # nice, isn't it?
                     "Carrés des diff.",
                     "Contributions",          # the most important data is this  column
                     "Somme des ctr.")
colnames(resTab)  <- 0:n

cat("Results\n")
print(resTab)

# résultats du test

print( testc )

# le graphique en barres s'obtient par

barplot(rbind(eobs,eth),beside=TRUE)


## exercise 7
## ----------

# if you are not able to program the recursive solution, follow the
# one from the sets package

library(sets)

allSets <- function(n) {
   nbSets <- 0
   nbAll  <- 2**n
   cat( sprintf("%3d",nbSets),"/",nbAll,"empty set\n")
   for (ind in (1:n)) {
       allEns <- set_combn(letters[1:n],ind)
       for (s in allEns) {
         nbSets <- nbSets + 1
         cat( sprintf("%3d",nbSets),"/",nbAll,as.character(s),"\n")
       } ; # fin pour s
   } ; # fin pour ind
} # end of function allSets

allSets(3)

## exercise 8
## ----------

.library <- function() {
   options(warn=-1)
   options(internet.info=100)
   suppressMessages(library(gdata,quietly=TRUE,verbose=FALSE,warn.conflicts=FALSE))
} # fin de fonction .library

.library()

# with a dot at the beginning R does not show the function in R studio's environment pane

## exercise 9
## ----------

# more information here: https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
# read my book at Dunod to know the answer for R. the method is called suffix tree.

# these instructions are NOT the solution

library(Biostrings)
library(Rlibstree)
library(stringi)

fn     <- "putida_gb_1.fasta"
dnaSeq <- readDNAStringSet(fn)
seq    <- toString(dnaSeq[1])
lrsStr <- getLongestRepeatedSubstring(seq)
nbc    <- nchar(lrsStr)
lrsPos <- stri_locate_all(seq,fixed=lrsStr)

cat("the length of the longest repeated substring of ",fn," is: ",nbc,"\n")
cat("positions of this substring are:\n")
print(lrsPos)


## exercise 10
## ----------

# there is still a controverse about the three systems for defining objects in R. have a look on the internet to know what it is about.

browseURL("https://stackoverflow.com/questions/27219132/creating-classes-in-r-s3-s4-r5-rc-or-r6")

# tidyverse is a "modern" way of using R, promoted by Hadley WICKAM and others. have a look at it and decide if it is worth the effort.

sbrowseURL("https://www.tidyverse.org/learn/")