Homework 11

Function Plotting

For this exercise I will be using an equation that shows how allelic frequencies over time change given a specific mutation rate. The initial frequency (p0) of the A allele is 0.90. The mutation rate (u) is 10^-03 . Finally time (t) represents the number of generations.

MutationChange <- function(p0=0.90,
                           u=10^-03,
                           t=1:10^5){ 
  q <- (1- (p0*exp(-u*t)))
  return(q)
} 


##########################################
MutationChangePlot <- function(p0=0.90,
                               u=10^-03,
                               t=1:10^3) {
  plot(x = t, y = MutationChange(p0,u,t),
       type = "l", 
       xlab = "Time", 
       ylab = "Change in The B Allele", 
       ylim = c(0,1))
  mtext(paste("u=", u, "p0=", p0), cex= 0.7)
  return()
}
MutationChangePlot()

## NULL

#########################################

# now build a grid of plots 
# each with different parameter values
# global variables 

p0Pars <- c(0.25, 0.65, 0.95)
uPars <- c(10^-01, 10^-02, 10^-03, 10^-04)
par(mfrow = c(3,4))

# enter into a double loop for plotting 

for(i in 1:length(p0Pars)){
  for(j in 1:length(uPars)){
    MutationChangePlot(p0=p0Pars[i], u=uPars[j])
  }
}

par(mfrow=c(1,1))
# Plotting redux with ggplot 
library(ggplot2)

p0Pars <- c(0.25, 0.65, 0.95)
uPars <- c(10^-01, 10^-02, 10^-03, 10^-04)
Time <- 1:10^3

modelFrame <- expand.grid(p0 = p0Pars, 
                          u = uPars, 
                          t = Time)

modelFrame

modelFrame$q <- NA

# tricky double for loop for filling new data frame 

for(i in 1:length(p0Pars)){
  for(j in 1:length(uPars)){
    modelFrame[modelFrame$p0==p0Pars[i] &  modelFrame$u == uPars[j], "q"] <- MutationChange(t=Time, p0= p0Pars[i], u=uPars[j])  # each combination of z parameter and c parameter - give me just those rows and the S column
  }
}

# Starts with cpars for every one changes with zpars specifics
# colS is the sumber of species given all the parameters

p1 <- ggplot(data= modelFrame)
p1 + geom_line(mapping=aes(x=t, y=q)) +
  facet_grid(p0~u)

p2 <- p1
p2 + geom_line(mapping = aes(x=t, y=q, group =u)) +
  facet_grid(.~p0)

Question 2 Randomization

# Preliminaries 
library(ggplot2)
library(TeachingDemos)
char2seed("Cruel April")

##########################################

# FUNCTION: readData 
# read in or generate data frame 
# input: file name (or nothin for demo)
# output: 3-column data frame of observed data (ID, x, y) 
# --------------------------------
readData <- function(z=NULL){
  if(is.null(z)){
    xVar <- c(iris$Sepal.Length)
    yVar <- c(iris$Sepal.Width)
    dF <- data.frame(ID=seq_along
                     (xVar), xVar, yVar)
    return(dF)}
}

########################################
#FUNCTION: getMetric 
# calculates metric for randomization test 
# input- 3- column data frame
# output: regression 

getMetric <- function(z=NULL){
  if(is.null(z)){
    xVar <- c(iris$Sepal.Length)
    yVar <- c(iris$Sepal.Width)
    z <- data.frame(ID=seq_along
                    (xVar), xVar, yVar)}
  . <- lm(z[,3]~z[,2])
  . <- summary(.)
  . <- .$coefficients[2,1]
  slope <- .
  return(slope)
}
getMetric()

## [1] -0.0618848

######################################

shuffleData <- function(z=NULL){
  if(is.null(z)){
    xVar <- c(iris$Sepal.Length)
    yVar <- c(iris$Sepal.Width)
    z <- data.frame(ID=seq_along
                    (xVar), xVar, yVar)}
  
  z[,3] <- sample(z[,3])
  return(z)
}
shuffleData()

#########################################

#Function: getPVAL
# calculate p value for observed, simulated data 
# input: list of observed metric and vector of simulated metric
# output: lower, upper tail probability vector

getPval <- function(z=NULL) {
  if(is.null(z)){
    z <- list(xObs=runif(1), xSim=runif(1000))}
  
  pLower <- mean(z[[2]] <=z[[1]])
  pUpper <- mean(z[[2]] >=z[[1]])
  
  return(c(pL=pLower, pU= pUpper))
  
}
getPval()

##    pL    pU 
## 0.504 0.496

#--------------------------------------- Main Body 

nSim <- 10000 # number of simulations
Xsim <- rep(NA, nSim) #will hold simulated slopes

dF <- readData()
Xobs <- getMetric(dF)

for (i in seq_len(nSim)) {
  Xsim[i] <- getMetric(shuffleData(dF)) }

slopes <- list(Xobs,Xsim)
getPval(slopes)

##     pL     pU 
## 0.0759 0.9241

#------------------------------------------
## FUNCTION: plotRanTest 
## ggplot graph 
## input : list of observed metric and vector of the simulated metric 
## output: ggplot graph 

plotRantTest <- function(z=NULL){
  if(is.null(z)){
    z <- list(xObs=runif(1), xSim=runif(1000))}
  df <- data.frame(ID= seq_along(z[[2]]), 
                   simX=z[[2]])
  p1 <- ggplot(data=df, mapping=aes(x=simX))
  p1 + geom_histogram(mapping=aes(fill=I("mistyrose1"), color=I("black"))) +
    geom_vline(aes(xintercept=z[[1]], col="blue"))
}
plotRantTest()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Question 3: For comparison, calculate in R the standard statistical analysis you would use with these data. How does the p-value compare for the standard test versus the p value you estimated from your randomization test? If the p values seem very different, run the program again with a different starting seed (and/or increase the number of replications in your randomization test). If there are persistent differences in the p value of the standard test versus your randomization, what do you think is responsible for this difference?

In both the observed the observed and randomized data sets the p-value is the same. This indicates that there may be an error in the function itself. We would expect to see different p-values due to the randomization that occurs.

lmiris <- lm(iris$Sepal.Length ~ iris$Sepal.Width, data = readData())
summary(lmiris)

## 
## Call:
## lm(formula = iris$Sepal.Length ~ iris$Sepal.Width, data = readData())
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5561 -0.6333 -0.1120  0.5579  2.2226 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.5262     0.4789   13.63   <2e-16 ***
## iris$Sepal.Width  -0.2234     0.1551   -1.44    0.152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8251 on 148 degrees of freedom
## Multiple R-squared:  0.01382,    Adjusted R-squared:  0.007159 
## F-statistic: 2.074 on 1 and 148 DF,  p-value: 0.1519

lmirisShuffle <- lm(iris$Sepal.Length ~ iris$Sepal.Width, data = shuffleData())
summary(lmirisShuffle)

## 
## Call:
## lm(formula = iris$Sepal.Length ~ iris$Sepal.Width, data = shuffleData())
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5561 -0.6333 -0.1120  0.5579  2.2226 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.5262     0.4789   13.63   <2e-16 ***
## iris$Sepal.Width  -0.2234     0.1551   -1.44    0.152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8251 on 148 degrees of freedom
## Multiple R-squared:  0.01382,    Adjusted R-squared:  0.007159 
## F-statistic: 2.074 on 1 and 148 DF,  p-value: 0.1519

Homework 11

Luke Daniels

4/4/2018