As before, we load the same libraries that we use in almost every script
# Load Libraries ----
# this has to be done each time you run a script
library(readxl) # read in Excel files
library(tidyverse) # dplyr, piping, ggplot, etc.
library(lubridate) # dates and times
library(scales) # scales on ggplot axes
library(skimr) # quick summary stats
library(janitor) # clean up Excel imports
library(patchwork) # multipanel graphs
# let's read in a new file to add some complexity for fun;
# read_csv() reports the column types it guessed when it runs
lakes.df <- read_csv(file = "data/reduced_lake_long_genus_species.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## permanent_id = col_double(),
## lake_name = col_character(),
## date = col_date(format = ""),
## group = col_character(),
## genus_species = col_character(),
## org_l = col_double(),
## year = col_double()
## )
# Mutate - log
# adds log_org_l, the base-10 log of org_l; 1 is added first so that
# a value of 0 gives 0 rather than -Inf
lakes_modified.df <- mutate(lakes.df, log_org_l = log10(org_l + 1))
# Mutate and mean ----
# no grouping is applied here, so every row gets the same overall
# mean of org_l (missing values are dropped before averaging)
lakes_modified.df <- mutate(
  lakes.df,
  mean_org_l = mean(org_l, na.rm = TRUE)
)
# Mean by group ------
# group_by() makes mean() run separately within each value of `group`;
# mutate() keeps every row and repeats that group's mean on each one
lakes_modified.df <- lakes.df %>%
  group_by(group) %>%
  mutate(mean_org_l = mean(org_l, na.rm = TRUE))
# Exercise: how would you modify this to do the mean by group and lake?
# (as written, it still groups by `group` only;
#  hint: group_by() accepts more than one column)
lakes_modified.df <- lakes.df %>%
  group_by(group) %>%
  mutate(mean_org_l = mean(org_l, na.rm = TRUE))
# Mean and Standard Error -----
# SE = sd / sqrt(n), where n must be the number of NON-missing values.
# A plain row count would also count rows where org_l is NA, so we
# count the non-missing values ourselves with sum(!is.na(org_l)).
lakes_modified.df <- lakes.df %>%
  group_by(group) %>%
  mutate(
    mean_org_l = mean(org_l, na.rm = TRUE),
    # spell out TRUE: T is an ordinary variable that can be reassigned
    se_org_l = sd(org_l, na.rm = TRUE) / sqrt(sum(!is.na(org_l)))
  )
# there are two ways to get summary statistics...
# the first is to do all of the math manually: summarize() collapses
# the data to one row per lake_name/group combination
lakes_summary.df <- lakes.df %>%
  group_by(lake_name, group) %>%
  summarize(
    mean_org_l = mean(org_l, na.rm = TRUE),
    # SE = sd / sqrt(number of non-missing values)
    se_org_l = sd(org_l, na.rm = TRUE) / sqrt(sum(!is.na(org_l)))
  )
## `summarise()` has grouped output by 'lake_name'. You can override using the `.groups` argument.
# the second way: let skim() compute the summary statistics of org_l
# for each lake_name/group combination and print them
lakes.df %>%
  group_by(lake_name, group) %>%
  skim(org_l)
Name | Piped data |
Number of rows | 1368 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
numeric | 1 |
________________________ | |
Group variables | lake_name, group |
Variable type: numeric
skim_variable | lake_name | group | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|
org_l | Grass | Cladoceran | 0 | 1 | 1.50 | 3.12 | 0 | 0 | 0.08 | 1.63 | 19.48 | ▇▁▁▁▁ |
org_l | Grass | Copepod | 0 | 1 | 4.88 | 9.47 | 0 | 0 | 0.00 | 4.51 | 46.06 | ▇▁▁▁▁ |
org_l | Indian | Cladoceran | 0 | 1 | 2.58 | 7.19 | 0 | 0 | 0.00 | 0.80 | 56.20 | ▇▁▁▁▁ |
org_l | Indian | Copepod | 0 | 1 | 3.21 | 6.72 | 0 | 0 | 0.07 | 1.96 | 34.22 | ▇▁▁▁▁ |
org_l | South | Cladoceran | 0 | 1 | 1.44 | 5.00 | 0 | 0 | 0.00 | 0.62 | 55.60 | ▇▁▁▁▁ |
org_l | South | Copepod | 0 | 1 | 3.27 | 7.76 | 0 | 0 | 0.00 | 2.04 | 56.03 | ▇▁▁▁▁ |
org_l | Willis | Cladoceran | 0 | 1 | 1.87 | 5.91 | 0 | 0 | 0.00 | 1.06 | 48.35 | ▇▁▁▁▁ |
org_l | Willis | Copepod | 0 | 1 | 2.35 | 7.00 | 0 | 0 | 0.00 | 0.73 | 57.34 | ▇▁▁▁▁ |
# the skim() result can be saved to a dataframe as well
# (here the data are grouped by `group` only)
skim.df <- lakes.df %>%
  dplyr::group_by(group) %>%
  skim(org_l)