I used a random data file for ease of use.
library(MASS)
library(ggplot2)
library(haven)
driving <- read_dta("driving_2004-2.dta")
driving2 <- data.frame(driving$totfat)
names(driving2) <- c("totalfatal")
mean(driving2)
## Warning in mean.default(driving2): argument is not numeric or logical:
## returning NA
## [1] NA
p1 <- ggplot(data=driving2, aes(x=totalfatal, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normPars <- fitdistr(driving$totfat,"normal")
print(normPars)
## mean sd
## 882.29167 841.90899
## (121.51910) ( 85.92698)
normPars$estimate["mean"]
## mean
## 882.2917
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 882 842
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 121.5 85.9
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 14767 0 0 7383
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 48
## $ loglik : num -391
## - attr(*, "class")= chr "fitdistr"
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(driving$totfat),len=length(driving$totfat))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(driving$totfat), args = list(mean = meanML, sd = sdML))
print(p1 + stat)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
expoPars <- fitdistr(driving$totfat,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(driving$totfat), args = list(rate=rateML))
print(p1 + stat + stat2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(driving$totfat), args = list(min=min(driving$totfat), max=max(driving$totfat)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gammaPars <- fitdistr(driving$totfat,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(driving$totfat), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pSpecial <- ggplot(data=driving2, aes(x=driving$totfat/(max(driving$totfat + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=driving$totfat/max(driving$totfat + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(driving$totfat), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We want to find which distribution best fits the data. The higher the number the better the fit.
normPars$loglik
## [1] -391.4213
gammaPars$loglik
## [1] -371.9341
expoPars$loglik
## [1] -373.5611
Gamma is the Best Fit!
Creating simulated data based on the parameters above.
z <- rgamma(n= 48, shape=shapeML, rate=rateML)
z <- data.frame(1:48,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 48 obs. of 2 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ myVar: num 605 2137 166 1109 442 ...
p5 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 738.47356 608.75477
## ( 87.86618) ( 62.13077)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 738 609
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 87.9 62.1
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 7720 0 0 3860
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 48
## $ loglik : num -376
## - attr(*, "class")= chr "fitdistr"
gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p5 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can see that the simulated data given the parameters doesn’t exactly match our original data set. However, it is fairly good for simulated data!