T Tests

##Load libraries We will read in the main files and load the libraries as we have worked with so far.

# One new package for summary stats
#install.packages("broom")
# install.packages("GGally")
# install.packages("car")
# install.packages("gvlma")


# load the libraries each time you restart R
library(tidyverse)
library(lubridate)
library(scales)
library(skimr)
library(janitor)
library(patchwork)
# library(reshape2)
library(broom)
library(GGally)
library(corrplot)
library(car)

Read in file

# read in the file
iris.df <- read_csv("data/iris.csv") %>%
  clean_names() %>%
  remove_empty(c("rows", "cols")) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Sepal.Length = col_double(),
##   Sepal.Width = col_double(),
##   Petal.Length = col_double(),
##   Petal.Width = col_double(),
##   Species = col_character()
## )
glimpse(iris.df)
## Rows: 150
## Columns: 5
## $ sepal_length <dbl> 5.1, NA, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.8…
## $ sepal_width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.…
## $ petal_length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.…
## $ petal_width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.…
## $ species      <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "setosa…

###Summary Statistics for the better look ###So this is a lot different than thinking about data from excel
Lets try to do the summary stats on the data now and see how it differs

# the data you want to look at
skim(iris.df)
Data summary
Name iris.df
Number of rows 150
Number of columns 5
_______________________
Column type frequency:
character 1
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
species 0 1 6 10 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
sepal_length 1 0.99 5.85 0.83 4.3 5.1 5.80 6.4 7.9 ▆▇▇▅▂
sepal_width 0 1.00 3.06 0.44 2.0 2.8 3.00 3.3 4.4 ▁▆▇▂▁
petal_length 0 1.00 3.76 1.77 1.0 1.6 4.35 5.1 6.9 ▇▁▆▇▂
petal_width 0 1.00 1.20 0.76 0.1 0.3 1.30 1.8 2.5 ▇▁▇▅▃
iris.df %>% group_by(species) %>% skim_to_wide()
## Warning: 'skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")
Data summary
Name .data
Number of rows 150
Number of columns 5
_______________________
Column type frequency:
numeric 4
________________________
Group variables species

Variable type: numeric

skim_variable species n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
sepal_length setosa 1 0.98 5.01 0.36 4.3 4.80 5.00 5.20 5.8 ▃▃▇▅▁
sepal_length versicolor 0 1.00 5.94 0.52 4.9 5.60 5.90 6.30 7.0 ▂▇▆▃▃
sepal_length virginica 0 1.00 6.59 0.64 4.9 6.23 6.50 6.90 7.9 ▁▃▇▃▂
sepal_width setosa 0 1.00 3.43 0.38 2.3 3.20 3.40 3.68 4.4 ▁▃▇▅▂
sepal_width versicolor 0 1.00 2.77 0.31 2.0 2.52 2.80 3.00 3.4 ▁▅▆▇▂
sepal_width virginica 0 1.00 2.97 0.32 2.2 2.80 3.00 3.18 3.8 ▂▆▇▅▁
petal_length setosa 0 1.00 1.46 0.17 1.0 1.40 1.50 1.58 1.9 ▁▃▇▃▁
petal_length versicolor 0 1.00 4.26 0.47 3.0 4.00 4.35 4.60 5.1 ▂▂▇▇▆
petal_length virginica 0 1.00 5.55 0.55 4.5 5.10 5.55 5.88 6.9 ▃▇▇▃▂
petal_width setosa 0 1.00 0.25 0.11 0.1 0.20 0.20 0.30 0.6 ▇▂▂▁▁
petal_width versicolor 0 1.00 1.33 0.20 1.0 1.20 1.30 1.50 1.8 ▅▇▃▆▁
petal_width virginica 0 1.00 2.03 0.27 1.4 1.80 2.00 2.30 2.5 ▂▇▆▅▇

Wide to long format

# this will add an index to the dataframe so you know what individual is which
iris_long.df <- iris.df %>% 
  mutate(individual = row_number()) %>% 
  gather(trait, measure, -species, - individual)

head(iris_long.df)
## # A tibble: 6 x 4
##   species individual trait        measure
##   <chr>        <int> <chr>          <dbl>
## 1 setosa           1 sepal_length     5.1
## 2 setosa           2 sepal_length    NA  
## 3 setosa           3 sepal_length     4.7
## 4 setosa           4 sepal_length     4.6
## 5 setosa           5 sepal_length     5  
## 6 setosa           6 sepal_length     5.4

Outliers

# Box Plots of data
iris_long.df %>% group_by(species, trait) %>% 
  ggplot( aes(x = trait, y = measure, color = species, fill=species))+
  geom_boxplot(aes(alpha=0.3)) 
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

These look good for the most part with only one or two significant. Your choice to transform or not.

##Factors
Make sure the categorical variable is a factor
Rearrange the order of groups so that control group is first, followed by treatment groups.
you can do this by:

# Make Factors from the different levels long way
# iris_long.df$species <- as.factor(iris_long.df$species)  
# iris_long.df$trait <- as.factor(iris_long.df$trait) 

# Make Factors dplyr
iris_long.df <- iris_long.df %>%
  mutate(
    sex = factor(species, 
    labels = c("setosa", "versicolor", "virginica")),
    trait = factor(trait, 
    labels = c("sepal_length", "sepal_width", "petal_length", "petal_width"))
  )

Test for normality of data and using the broom package

So I think this is premature but some people like to test the normality of the data but really you should be assessing the normality of the residuals. But here it goes…

# turn off scientific notaton
options(scipen = 999)
# to turn back on 
#options(scipen = 0)

# Test for normality of each group and store in shapirowilktests
# This uses the broom package to get clean output of the test 
iris_long.df %>% group_by(species, trait) %>% do(tidy(shapiro.test(.$measure)))
## # A tibble: 12 x 5
## # Groups:   species, trait [12]
##    species    trait        statistic     p.value method                     
##    <chr>      <fct>            <dbl>       <dbl> <chr>                      
##  1 setosa     sepal_length     0.955 0.0548      Shapiro-Wilk normality test
##  2 setosa     sepal_width      0.800 0.000000866 Shapiro-Wilk normality test
##  3 setosa     petal_length     0.977 0.456       Shapiro-Wilk normality test
##  4 setosa     petal_width      0.972 0.272       Shapiro-Wilk normality test
##  5 versicolor sepal_length     0.966 0.158       Shapiro-Wilk normality test
##  6 versicolor sepal_width      0.948 0.0273      Shapiro-Wilk normality test
##  7 versicolor petal_length     0.978 0.465       Shapiro-Wilk normality test
##  8 versicolor petal_width      0.974 0.338       Shapiro-Wilk normality test
##  9 virginica  sepal_length     0.962 0.110       Shapiro-Wilk normality test
## 10 virginica  sepal_width      0.960 0.0870      Shapiro-Wilk normality test
## 11 virginica  petal_length     0.971 0.258       Shapiro-Wilk normality test
## 12 virginica  petal_width      0.967 0.181       Shapiro-Wilk normality test
#You can do this on all variables faster with if there was only one grouping
# tapply(iris_long.df$measure, iris_long.df$species, shapiro.test)

##Test for homogenetiy of variances

Best to use the Levenes test compared to the Bartlet test
uses the car package

http://www.cookbook-r.com/Statistical_analysis/Homogeneity_of_variance/

Note however that this is doing the homogeneity test on all traits and not each one which is what you really should do and is not often done. Need to check on this though.

#Test for homogeneity of variances by groups
leveneTest(sepal_length ~ species, data=iris.df)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value   Pr(>F)   
## group   2  6.0386 0.003022 **
##       146                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Compared to the Bartlet test
bartlett.test(sepal_length ~ species, data=iris.df)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  sepal_length by species
## Bartlett's K-squared = 15.308, df = 2, p-value = 0.0004742

Create a subset dataframe

sepal_length.df <- iris_long.df %>%
  filter(trait =="sepal_length") %>%
  filter(species != "setosa")

If p-value >= 0.05, use var.equal=TRUE below

t.test(measure ~ species, data=sepal_length.df,
       var.equal=TRUE,
       conf.level=0.95)
## 
##  Two Sample t-test
## 
## data:  measure by species
## t = -12.604, df = 98, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group versicolor and group virginica is not equal to 0
## 95 percent confidence interval:
##  -1.495426 -1.088574
## sample estimates:
## mean in group versicolor  mean in group virginica 
##                    4.260                    5.552
t.test(measure ~ species, data=sepal_length.df,
       var.equal=FALSE,
       conf.level=0.95)
## 
##  Welch Two Sample t-test
## 
## data:  measure by species
## t = -12.604, df = 95.57, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group versicolor and group virginica is not equal to 0
## 95 percent confidence interval:
##  -1.49549 -1.08851
## sample estimates:
## mean in group versicolor  mean in group virginica 
##                    4.260                    5.552

Boxplot of data

sepal_length.df %>%
  ggplot(aes(species, measure, fill = species)) +
  geom_boxplot()

Mean and standard error plot

sepal_length.df %>% 
ggplot(aes(species, color=species)) + 
  stat_summary(aes(y = measure),
               fun.y = mean, na.rm = TRUE,
               geom = "point",
               size = 3) + 
  stat_summary(aes(y = measure),
               fun.data = mean_se, na.rm = TRUE,
               geom = "errorbar",
               width = 0.2) +
  labs(x = "Species", y = "Length") +
  scale_color_manual(name = "Species", 
                     values = c("blue", "red"),
                     labels = c("Versicolor", "Virginica"))
## Warning: `fun.y` is deprecated. Use `fun` instead.