Loading the Data

I used a random data file for ease of use.

library(MASS)
library(ggplot2)
library(haven) 
driving <- read_dta("driving_2004-2.dta")

Plotting the Initial Histogram

driving2 <- data.frame(driving$totfat)
                       names(driving2) <- c("totalfatal")
      
mean(driving2)

## Warning in mean.default(driving2): argument is not numeric or logical:
## returning NA

## [1] NA

p1 <- ggplot(data=driving2, aes(x=totalfatal, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Adding the Empirical Density Curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get Maximum Likelihood Parameters for Normal

normPars <- fitdistr(driving$totfat,"normal")
print(normPars)

##      mean         sd    
##   882.29167   841.90899 
##  (121.51910) ( 85.92698)

normPars$estimate["mean"]

##     mean 
## 882.2917

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 882 842
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 121.5 85.9
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 14767 0 0 7383
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 48
##  $ loglik  : num -391
##  - attr(*, "class")= chr "fitdistr"

Generating Different Distributions

Normal Distribution

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(driving$totfat),len=length(driving$totfat))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(driving$totfat), args = list(mean = meanML, sd = sdML))
print(p1 + stat)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot Exponential Probability Distribution

expoPars <- fitdistr(driving$totfat,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(driving$totfat), args = list(rate=rateML))
print(p1 + stat + stat2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Uniform Probability Distribution

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(driving$totfat), args = list(min=min(driving$totfat), max=max(driving$totfat)))
p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Gamma Distribution

gammaPars <- fitdistr(driving$totfat,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(driving$totfat), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Beta Distribution

pSpecial <- ggplot(data=driving2, aes(x=driving$totfat/(max(driving$totfat + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=driving$totfat/max(driving$totfat + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(driving$totfat), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compare Log Likelihoods

We want to find which distribution best fits the data. The higher the number the better the fit.

normPars$loglik

## [1] -391.4213

gammaPars$loglik

## [1] -371.9341

expoPars$loglik

## [1] -373.5611

Gamma is the Best Fit!

Create Simulated Data

Creating simulated data based on the parameters above.

z <- rgamma(n= 48, shape=shapeML, rate=rateML) 
z <- data.frame(1:48,z)

names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)

## 'data.frame':    48 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ myVar: num  605 2137 166 1109 442 ...

p5 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p5)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

normPars <- fitdistr(z$myVar,"normal")
print(normPars)

##      mean         sd    
##   738.47356   608.75477 
##  ( 87.86618) ( 62.13077)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 738 609
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 87.9 62.1
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 7720 0 0 3860
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 48
##  $ loglik  : num -376
##  - attr(*, "class")= chr "fitdistr"

gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p5 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We can see that the simulated data given the parameters doesn’t exactly match our original data set. However, it is fairly good for simulated data!

Probability Distributions in R

Luke Daniels

2/21/2018