As usual - load the necessary libraries.
# Leach et al reduced for GLEON Workshop
# Load Libraries ----
# this is done each time you run a script
library("readxl") # read in excel files
library("tidyverse") # dplyr and piping and ggplot etc
library("lubridate") # dates and times
library("scales") # scales on ggplot axes
library("skimr") # quick summary stats
library("janitor") # clean up excel imports
library("patchwork") # multipanel graphs
Again, read in a simplified file.
# Import data ----
# Up to now we have only looked at data. To practise modifying columns
# and rows, read in a new, slightly more complex file.
lakes.df <- read_csv(file = "data/reduced_lake_long_genus_species.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## permanent_id = col_double(),
## lake_name = col_character(),
## date = col_date(format = ""),
## group = col_character(),
## genus_species = col_character(),
## org_l = col_double(),
## year = col_double()
## )
head(lakes.df)
## # A tibble: 6 x 7
## permanent_id lake_name date group genus_species org_l year
## <dbl> <chr> <date> <chr> <chr> <dbl> <dbl>
## 1 47723283 Willis 1996-06-17 Cladocer… Bosmina_longirostris 0 1996
## 2 47723283 Willis 1996-06-17 Cladocer… Daphnia_catawba 0 1996
## 3 47723283 Willis 1996-06-17 Cladocer… Daphnia_pulex 0 1996
## 4 47723283 Willis 1996-06-17 Cladocer… Diaphanosoma_birgei 0 1996
## 5 47723283 Willis 1996-06-17 Cladocer… Diaphanosoma_brachyu… 0.881 1996
## 6 47723283 Willis 1996-06-17 Cladocer… Holopedium_giberum 31.6 1996
Mutate allows you to do a lot of the heavy lifting to modify variables and is something that will be used a lot.
# Modify a variable ----
# mutate() adds a new column (or overwrites an existing one). Adding 1
# before taking log10 keeps zero values defined, since log10(1) is 0.
lakes_modified.df <- lakes.df %>%
  mutate(log_org_l = log10(1 + org_l))
# Essentially any vectorised math can be used inside mutate(), including
# calculations done within groups - more on that later.
Comparison operators, which return TRUE or FALSE, are really important here:
* less than <
* greater than >
* less than or equal to <=
* greater than or equal to >=
* is equal to ==
* is not equal to !=
* is a member of a set %in% c("x", "y", "z")
These can be combined with
& and
| or
# Reorder columns ----
# select() keeps only the columns that are listed, in the order given, so it
# can remove, retain, or reorder columns.
# Note: `year` is not listed here, so it is dropped from lakes.df.
lakes.df <- lakes.df %>%
  select(
    date,
    lake_name,
    permanent_id,
    group,
    genus_species,
    org_l
  )
You can also move a single column to the front and use everything() to keep the rest.
# Move a single column to the front; everything() keeps all of the
# remaining columns in their existing order.
lakes.df <- lakes.df %>%
  select(lake_name, everything())
You can remove columns by putting a minus sign (-) in front of the variable name.
# Drop a column by prefixing its name with a minus sign.
lakes.df <- lakes.df %>%
  select(-permanent_id)
You can also use starts_with() or ends_with() to select columns whose names match a pattern. There is a lot more you can do with this, but we won't go into that here.
# Select columns by pattern matching with helpers such as starts_with() or
# ends_with(). Columns whose names start with "g" come first, then date and
# lake_name, then whatever is left.
lakes.df <- lakes.df %>%
  select(
    starts_with("g"),
    date,
    lake_name,
    everything()
  )
We just saw how to reorder columns or remove them. Here we will go over how to filter data to remove rows based on different statements.
# Filtering data and counting data ----
# Comparison operators are what make filter() useful. The result can simply
# be printed, as it is here, or assigned to a new object.
# Keep the rows where org_l is above 5, then only those from Willis lake.
lakes.df %>%
  filter(org_l > 5) %>%
  filter(lake_name == "Willis")
## # A tibble: 32 x 5
## group genus_species date lake_name org_l
## <chr> <chr> <date> <chr> <dbl>
## 1 Cladoceran Holopedium_giberum 1996-06-17 Willis 31.6
## 2 Copepod Leptodiaptomus_minutus 1996-06-17 Willis 19.6
## 3 Cladoceran Bosmina_longirostris 1998-07-13 Willis 7.96
## 4 Cladoceran Bosmina_longirostris 2001-07-17 Willis 5.19
## 5 Cladoceran Bosmina_longirostris 2006-07-18 Willis 7.5
## 6 Cladoceran Holopedium_giberum 2006-07-18 Willis 5.15
## 7 Copepod Leptodiaptomus_minutus 2006-07-18 Willis 17.2
## 8 Copepod Leptodiaptomus_minutus 2005-07-19 Willis 11
## 9 Copepod Leptodiaptomus_minutus 1996-07-22 Willis 18.2
## 10 Copepod Mesocyclops_edax 1994-07-27 Willis 33.1
## # … with 22 more rows
These two filters could also be combined into a single call, e.g. filter(org_l > 5 & lake_name == "Willis").
The count statement lets us explore parts of the data and see what the data looks like.
# Explore the data with count() ----
# How many lakes are there, and how many rows does each one have?
lakes.df %>%
  count(lake_name)
## # A tibble: 4 x 2
## lake_name n
## <chr> <int>
## 1 Grass 348
## 2 Indian 348
## 3 South 348
## 4 Willis 324
# How many genus_species are there, and how many rows does each one have?
lakes.df %>%
  count(genus_species)
## # A tibble: 12 x 2
## genus_species n
## <chr> <int>
## 1 Aglaodiaptomus_leptpus 114
## 2 Bosmina_longirostris 114
## 3 Cyclops_scutifer 114
## 4 Daphnia_catawba 114
## 5 Daphnia_pulex 114
## 6 Diaphanosoma_birgei 114
## 7 Diaphanosoma_brachyurum 114
## 8 Epischura_lacustris 114
## 9 Holopedium_giberum 114
## 10 Leptodiaptomus_minutus 114
## 11 Mesocyclops_edax 114
## 12 Tropocyclops_extensus 114
# Oddly, every genus_species has the same n, so look a bit closer:
# how many rows per genus_species have an org_l of exactly zero?
# count() groups by genus_species itself, so no group_by() is needed; leaving
# it out also means the result is a plain, ungrouped tibble.
lakes.df %>%
  filter(org_l == 0) %>%
  count(genus_species)
## # A tibble: 12 x 2
## genus_species n
## <chr> <int>
## 1 Aglaodiaptomus_leptpus 87
## 2 Bosmina_longirostris 21
## 3 Cyclops_scutifer 113
## 4 Daphnia_catawba 42
## 5 Daphnia_pulex 88
## 6 Diaphanosoma_birgei 113
## 7 Diaphanosoma_brachyurum 66
## 8 Epischura_lacustris 112
## 9 Holopedium_giberum 55
## 10 Leptodiaptomus_minutus 3
## 11 Mesocyclops_edax 5
## 12 Tropocyclops_extensus 65
# There are a lot of zeros - count again with the zero rows removed.
lakes.df %>%
  filter(org_l != 0) %>%
  count(genus_species)
## # A tibble: 12 x 2
## genus_species n
## <chr> <int>
## 1 Aglaodiaptomus_leptpus 27
## 2 Bosmina_longirostris 93
## 3 Cyclops_scutifer 1
## 4 Daphnia_catawba 72
## 5 Daphnia_pulex 26
## 6 Diaphanosoma_birgei 1
## 7 Diaphanosoma_brachyurum 48
## 8 Epischura_lacustris 2
## 9 Holopedium_giberum 59
## 10 Leptodiaptomus_minutus 111
## 11 Mesocyclops_edax 109
## 12 Tropocyclops_extensus 49
You can use the ifelse command to do a lot of basic flagging and modification of data
# if else ----
# Conditional flagging: label every row according to whether org_l is zero.
# ifelse(test, yes, no) is vectorised, so the test is applied to each row.
lakes.df <- lakes.df %>%
  mutate(flag = ifelse(org_l != 0, "NOT ZERO", "ZERO"))
The case_when statement allows a lot more flexibility
# case when ----
# case_when() does the same kind of flagging but with several conditions.
# Conditions are checked in order and the first one that is TRUE sets the
# label; the final TRUE ~ ... line is the catch-all for every other row
# (here: negative values, values of 100 or more, and NA).
# The lowest non-zero bin covers everything above 0 and below 10 (0.881, for
# example), so it is labelled "0 to 10" rather than "1 to 10".
lakes.df <- lakes.df %>%
  mutate(
    flag = case_when(
      org_l == 0 ~ "ZERO",
      org_l > 0 & org_l < 10 ~ "0 to 10",
      org_l >= 10 & org_l < 100 ~ "10 to 100",
      TRUE ~ "something else"
    )
  )