# install.packages("devtools")
# devtools::install_github("thomasp85/patchwork")
# Attach the packages used in this walkthrough; rerun after every R restart.
library("readxl") # read in Excel files
library("tidyverse") # dplyr, the pipe, ggplot2, readr, etc.
library("lubridate") # dates and times
library("scales") # scales on ggplot axes
library("skimr") # quick summary stats
library("janitor") # clean up Excel imports
library("patchwork") # multipanel graphs
# NOTE(review): skimr is already attached above; this second call is redundant.
library(skimr) # great way to do summary stats
##Read files
# Import the M&M measurements from CSV into a tibble named mm.df.
mm.df <- readr::read_csv(file = "data/mms.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## center = col_character(),
## color = col_character(),
## diameter = col_double(),
## mass = col_double()
## )
Let's look at a few ways to get summary statistics. The first is the simplest and uses base R.
# Base R overview: one summary column per variable in mm.df.
summary(object = mm.df)
## center color diameter mass
## Length:816 Length:816 Min. :11.23 Min. :0.72
## Class :character Class :character 1st Qu.:13.22 1st Qu.:0.86
## Mode :character Mode :character Median :13.60 Median :0.92
## Mean :14.17 Mean :1.42
## 3rd Qu.:15.30 3rd Qu.:1.93
## Max. :17.88 Max. :3.62
A better way is to use the skimr package.
# Whole-table skimr report. The data is piped in on purpose: the report
# header then names its input "Piped data".
mm.df %>% skimr::skim()
Name | Piped data |
Number of rows | 816 |
Number of columns | 4 |
_______________________ | |
Column type frequency: | |
character | 2 |
numeric | 2 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
center | 0 | 1 | 5 | 13 | 0 | 3 | 0 |
color | 0 | 1 | 3 | 6 | 0 | 6 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
diameter | 0 | 1 | 14.17 | 1.22 | 11.23 | 13.22 | 13.60 | 15.30 | 17.88 | ▁▇▂▃▁ |
mass | 0 | 1 | 1.42 | 0.71 | 0.72 | 0.86 | 0.92 | 1.93 | 3.62 | ▇▂▂▂▁ |
The cool part of skimr is that you can compute the summaries by group.
# Per-center skimr report: group the rows by center, then skim each group.
mm.df %>%
  dplyr::group_by(center) %>%
  skimr::skim()
Name | Piped data |
Number of rows | 816 |
Number of columns | 4 |
_______________________ | |
Column type frequency: | |
character | 1 |
numeric | 2 |
________________________ | |
Group variables | center |
Variable type: character
skim_variable | center | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|---|
color | peanut | 0 | 1 | 3 | 6 | 0 | 6 | 0 |
color | peanut butter | 0 | 1 | 3 | 6 | 0 | 6 | 0 |
color | plain | 0 | 1 | 3 | 6 | 0 | 6 | 0 |
Variable type: numeric
skim_variable | center | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|
diameter | peanut | 0 | 1 | 14.77 | 0.98 | 12.45 | 14.13 | 14.69 | 15.47 | 17.88 | ▂▇▇▃▁ |
diameter | peanut butter | 0 | 1 | 15.77 | 0.63 | 13.91 | 15.32 | 15.72 | 16.19 | 17.61 | ▁▅▇▃▁ |
diameter | plain | 0 | 1 | 13.28 | 0.34 | 11.23 | 13.08 | 13.28 | 13.48 | 14.38 | ▁▁▃▇▁ |
mass | peanut | 0 | 1 | 2.60 | 0.34 | 1.93 | 2.36 | 2.58 | 2.81 | 3.62 | ▃▇▆▃▁ |
mass | peanut butter | 0 | 1 | 1.80 | 0.27 | 1.19 | 1.62 | 1.77 | 1.94 | 2.63 | ▂▇▇▂▁ |
mass | plain | 0 | 1 | 0.86 | 0.05 | 0.72 | 0.83 | 0.87 | 0.89 | 1.01 | ▁▃▇▃▁ |
Finally, you can get a summary a different way, but it is a bit longer.
# Mean diameter and mean mass for every center/color combination,
# ignoring missing values.
# NOTE(review): "mean_diamter" is a misspelling of "mean_diameter"; it is
# kept as-is here so the column name matches the printed result.
mm.df %>%
  dplyr::group_by(center, color) %>%
  dplyr::summarise(
    mean_diamter = mean(diameter, na.rm = TRUE),
    mean_mass = mean(mass, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'center'. You can override using the `.groups` argument.
## # A tibble: 18 x 4
## # Groups: center [3]
## center color mean_diamter mean_mass
## <chr> <chr> <dbl> <dbl>
## 1 peanut blue 14.8 2.58
## 2 peanut brown 14.7 2.57
## 3 peanut green 15.0 2.68
## 4 peanut orange 14.6 2.57
## 5 peanut red 15.0 2.63
## 6 peanut yellow 14.5 2.57
## 7 peanut butter blue 15.9 1.85
## 8 peanut butter brown 15.7 1.80
## 9 peanut butter green 16.0 1.92
## 10 peanut butter orange 15.7 1.73
## 11 peanut butter red 15.8 1.74
## 12 peanut butter yellow 15.7 1.74
## 13 plain blue 13.2 0.860
## 14 plain brown 13.3 0.871
## 15 plain green 13.3 0.870
## 16 plain orange 13.3 0.865
## 17 plain red 13.3 0.854
## 18 plain yellow 13.4 0.865
These are all well and good, but looking at a graph is cool.
# Plot the mean diameter (with standard-error bars) for each candy color.
ggplot(mm.df, aes(x = color, y = diameter, color = color)) +
  # One point per color, placed at the mean diameter.
  stat_summary(
    fun = mean, na.rm = TRUE,
    geom = "point",
    size = 3
  ) +
  # Error bars computed by mean_se around the same mean.
  stat_summary(
    fun.data = mean_se, na.rm = TRUE,
    geom = "errorbar",
    width = 0.2
  ) +
  # Axis label corrected: it was misspelled and missing its closing parenthesis.
  # NOTE(review): "units" is a placeholder; confirm the measurement unit.
  labs(x = "color", y = "Diameter (units)")
We can also add shape as a grouping variable for the center of the M&M's.
# Plot the mean diameter (with standard-error bars) for each candy color,
# split by center type: center sets both the grouping and the point shape.
ggplot(
  mm.df,
  aes(x = color, y = diameter, group = center, shape = center, color = color)
) +
  # One point per color/center combination, placed at the mean diameter.
  stat_summary(
    fun = mean, na.rm = TRUE,
    geom = "point",
    size = 3
  ) +
  # Error bars computed by mean_se around the same mean.
  stat_summary(
    fun.data = mean_se, na.rm = TRUE,
    geom = "errorbar",
    width = 0.2
  ) +
  # Axis label corrected: "Diamter" was a misspelling of "Diameter".
  # NOTE(review): "units" is a placeholder; confirm the measurement unit.
  labs(x = "color", y = "Diameter (units)")