Function Plotting
For this exercise I will be using an equation that shows how allelic frequencies over time change given a specific mutation rate. The initial frequency (p0) of the A allele is 0.90. The mutation rate (u) is 10-03 . Finally time (t) represents the number of generations.
MutationChange <- function(p0=0.90,
u=10^-03,
t=1:10^5){
q <- (1- (p0*exp(-u*t)))
return(q)
}
##########################################
MutationChangePlot <- function(p0=0.90,
u=10^-03,
t=1:10^3) {
plot(x = t, y = MutationChange(p0,u,t),
type = "l",
xlab = "Time",
ylab = "Change in The B Allele",
ylim = c(0,1))
mtext(paste("u=", u, "p0=", p0), cex= 0.7)
return()
}
MutationChangePlot()
## NULL
#########################################
# now build a grid of plots
# each with different parameter values
# global variables
p0Pars <- c(0.25, 0.65, 0.95)
uPars <- c(10^-01, 10^-02, 10^-03, 10^-04)
par(mfrow = c(3,4))
# enter into a double loop for plotting
for(i in 1:length(p0Pars)){
for(j in 1:length(uPars)){
MutationChangePlot(p0=p0Pars[i], u=uPars[j])
}
}
par(mfrow=c(1,1))
# Plotting redux with ggplot
library(ggplot2)
p0Pars <- c(0.25, 0.65, 0.95)
uPars <- c(10^-01, 10^-02, 10^-03, 10^-04)
Time <- 1:10^3
modelFrame <- expand.grid(p0 = p0Pars,
u = uPars,
t = Time)
modelFrame
modelFrame$q <- NA
# tricky double for loop for filling new data frame
for(i in 1:length(p0Pars)){
for(j in 1:length(uPars)){
modelFrame[modelFrame$p0==p0Pars[i] & modelFrame$u == uPars[j], "q"] <- MutationChange(t=Time, p0= p0Pars[i], u=uPars[j]) # each combination of z parameter and c parameter - give me just those rows and the S column
}
}
# Starts with cpars for every one changes with zpars specifics
# colS is the sumber of species given all the parameters
p1 <- ggplot(data= modelFrame)
p1 + geom_line(mapping=aes(x=t, y=q)) +
facet_grid(p0~u)
p2 <- p1
p2 + geom_line(mapping = aes(x=t, y=q, group =u)) +
facet_grid(.~p0)
Question 2 Randomization
# Preliminaries
library(ggplot2)
library(TeachingDemos)
char2seed("Cruel April")
##########################################
# FUNCTION: readData
# read in or generate data frame
# input: file name (or nothin for demo)
# output: 3-column data frame of observed data (ID, x, y)
# --------------------------------
readData <- function(z=NULL){
if(is.null(z)){
xVar <- c(iris$Sepal.Length)
yVar <- c(iris$Sepal.Width)
dF <- data.frame(ID=seq_along
(xVar), xVar, yVar)
return(dF)}
}
########################################
#FUNCTION: getMetric
# calculates metric for randomization test
# input- 3- column data frame
# output: regression
getMetric <- function(z=NULL){
if(is.null(z)){
xVar <- c(iris$Sepal.Length)
yVar <- c(iris$Sepal.Width)
z <- data.frame(ID=seq_along
(xVar), xVar, yVar)}
. <- lm(z[,3]~z[,2])
. <- summary(.)
. <- .$coefficients[2,1]
slope <- .
return(slope)
}
getMetric()
## [1] -0.0618848
######################################
shuffleData <- function(z=NULL){
if(is.null(z)){
xVar <- c(iris$Sepal.Length)
yVar <- c(iris$Sepal.Width)
z <- data.frame(ID=seq_along
(xVar), xVar, yVar)}
z[,3] <- sample(z[,3])
return(z)
}
shuffleData()
#########################################
#Function: getPVAL
# calculate p value for observed, simulated data
# input: list of observed metric and vector of simulated metric
# output: lower, upper tail probability vector
getPval <- function(z=NULL) {
if(is.null(z)){
z <- list(xObs=runif(1), xSim=runif(1000))}
pLower <- mean(z[[2]] <=z[[1]])
pUpper <- mean(z[[2]] >=z[[1]])
return(c(pL=pLower, pU= pUpper))
}
getPval()
## pL pU
## 0.504 0.496
#--------------------------------------- Main Body
nSim <- 10000 # number of simulations
Xsim <- rep(NA, nSim) #will hold simulated slopes
dF <- readData()
Xobs <- getMetric(dF)
for (i in seq_len(nSim)) {
Xsim[i] <- getMetric(shuffleData(dF)) }
slopes <- list(Xobs,Xsim)
getPval(slopes)
## pL pU
## 0.0759 0.9241
#------------------------------------------
## FUNCTION: plotRanTest
## ggplot graph
## input : list of observed metric and vector of the simulated metric
## output: ggplot graph
plotRantTest <- function(z=NULL){
if(is.null(z)){
z <- list(xObs=runif(1), xSim=runif(1000))}
df <- data.frame(ID= seq_along(z[[2]]),
simX=z[[2]])
p1 <- ggplot(data=df, mapping=aes(x=simX))
p1 + geom_histogram(mapping=aes(fill=I("mistyrose1"), color=I("black"))) +
geom_vline(aes(xintercept=z[[1]], col="blue"))
}
plotRantTest()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Question 3: For comparison, calculate in R the standard statistical analysis you would use with these data. How does the p-value compare for the standard test versus the p value you estimated from your randomization test? If the p values seem very different, run the program again with a different starting seed (and/or increase the number of replications in your randomization test). If there are persistent differences in the p value of the standard test versus your randomization, what do you think is responsible for this difference?
In both the observed the observed and randomized data sets the p-value is the same. This indicates that there may be an error in the function itself. We would expect to see different p-values due to the randomization that occurs.
lmiris <- lm(iris$Sepal.Length ~ iris$Sepal.Width, data = readData())
summary(lmiris)
##
## Call:
## lm(formula = iris$Sepal.Length ~ iris$Sepal.Width, data = readData())
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5561 -0.6333 -0.1120 0.5579 2.2226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5262 0.4789 13.63 <2e-16 ***
## iris$Sepal.Width -0.2234 0.1551 -1.44 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8251 on 148 degrees of freedom
## Multiple R-squared: 0.01382, Adjusted R-squared: 0.007159
## F-statistic: 2.074 on 1 and 148 DF, p-value: 0.1519
lmirisShuffle <- lm(iris$Sepal.Length ~ iris$Sepal.Width, data = shuffleData())
summary(lmirisShuffle)
##
## Call:
## lm(formula = iris$Sepal.Length ~ iris$Sepal.Width, data = shuffleData())
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5561 -0.6333 -0.1120 0.5579 2.2226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5262 0.4789 13.63 <2e-16 ***
## iris$Sepal.Width -0.2234 0.1551 -1.44 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8251 on 148 degrees of freedom
## Multiple R-squared: 0.01382, Adjusted R-squared: 0.007159
## F-statistic: 2.074 on 1 and 148 DF, p-value: 0.1519