Load libraries

As usual - load the necessary libraries.

# Leach et al reduced for GLEON Workshop

# Load Libraries ----
# this is done each time you run a script
library("readxl") # read in excel files
library("tidyverse") # dplyr and piping and ggplot etc
library("lubridate") # dates and times
library("scales") # scales on ggplot axes
library("skimr") # quick summary stats
library("janitor") # clean up excel imports
library("patchwork") # multipanel graphs

Read in files

Again read in a simplified file

# Up to now we have only inspected the data.
# Next we will change it: adding, dropping, and reordering columns,
# and keeping or removing rows.

# Read in a new, slightly more complex file to practice on
lakes.df <- read_csv(file = "data/reduced_lake_long_genus_species.csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   permanent_id = col_double(),
##   lake_name = col_character(),
##   date = col_date(format = ""),
##   group = col_character(),
##   genus_species = col_character(),
##   org_l = col_double(),
##   year = col_double()
## )

# Preview the first six rows of the imported data
head(lakes.df, n = 6)

## # A tibble: 6 x 7
##   permanent_id lake_name date       group     genus_species          org_l  year
##          <dbl> <chr>     <date>     <chr>     <chr>                  <dbl> <dbl>
## 1     47723283 Willis    1996-06-17 Cladocer… Bosmina_longirostris   0      1996
## 2     47723283 Willis    1996-06-17 Cladocer… Daphnia_catawba        0      1996
## 3     47723283 Willis    1996-06-17 Cladocer… Daphnia_pulex          0      1996
## 4     47723283 Willis    1996-06-17 Cladocer… Diaphanosoma_birgei    0      1996
## 5     47723283 Willis    1996-06-17 Cladocer… Diaphanosoma_brachyu…  0.881  1996
## 6     47723283 Willis    1996-06-17 Cladocer… Holopedium_giberum    31.6    1996

Mutate - Modifying variables

Mutate allows you to do a lot of the heavy lifting to modify variables and will be something that is used a lot.

# Add a new column holding a transformed version of an existing one.
# The + 1 keeps zero values defined, since log10(0) is -Inf.

lakes_modified.df <- lakes.df %>%
  mutate(log_org_l = log10(1 + org_l))

# Any arithmetic can go inside mutate(), including calculations done
# within groups - we will go over this more later.

Select to remove/reorder columns

Boolean (comparison) operators are really important when filtering rows:
* less than <
* greater than >
* less than or equal to <=
* greater than or equal to >=
* is equal to ==
* is not equal to !=
* inclusive of %in% c("x", "y", "z")

These can be combined with
& and
| or

# select() keeps only the columns that are named, in the order given
# Reorder columns ----
# Note: year is not listed here, so it is dropped from lakes.df
lakes.df <- lakes.df %>%
  select(
    date,
    lake_name,
    permanent_id,
    group,
    genus_species,
    org_l
  )

You can also move a single column to the front and use everything() to keep the rest.

# Move one column to the front; everything() keeps all remaining columns
lakes.df <- lakes.df %>% select(lake_name, everything())

You can remove columns by putting a minus sign (-) in front of the variable name.

# Drop a column by putting a minus sign in front of its name
lakes.df <- lakes.df %>% select(-permanent_id)

You can also use starts_with() or ends_with() to select columns whose names match a pattern. There is a lot more you can do with this, but we won't go into that here.

# Columns can also be chosen by matching on their names
# with helpers such as starts_with() or ends_with()
lakes.df <- lakes.df %>%
  select(
    starts_with("g"),
    date,
    lake_name,
    everything()
  )

Filtering data

We just saw how to reorder columns or remove them. Here we will go over how to filter data to remove rows based on different statements.

# Filtering data and counting data ----
# filter() keeps the rows for which a logical condition is TRUE.
# Without an assignment the result is only printed, not saved.

# Keep the rows where org_l is above 5, then keep only the Willis lake rows
lakes.df %>%
  filter(org_l > 5) %>%
  filter(lake_name == "Willis")

## # A tibble: 32 x 5
##    group      genus_species          date       lake_name org_l
##    <chr>      <chr>                  <date>     <chr>     <dbl>
##  1 Cladoceran Holopedium_giberum     1996-06-17 Willis    31.6 
##  2 Copepod    Leptodiaptomus_minutus 1996-06-17 Willis    19.6 
##  3 Cladoceran Bosmina_longirostris   1998-07-13 Willis     7.96
##  4 Cladoceran Bosmina_longirostris   2001-07-17 Willis     5.19
##  5 Cladoceran Bosmina_longirostris   2006-07-18 Willis     7.5 
##  6 Cladoceran Holopedium_giberum     2006-07-18 Willis     5.15
##  7 Copepod    Leptodiaptomus_minutus 2006-07-18 Willis    17.2 
##  8 Copepod    Leptodiaptomus_minutus 2005-07-19 Willis    11   
##  9 Copepod    Leptodiaptomus_minutus 1996-07-22 Willis    18.2 
## 10 Copepod    Mesocyclops_edax       1994-07-27 Willis    33.1 
## # … with 22 more rows

These two filters could also be combined into a single call, for example filter(org_l > 5, lake_name == "Willis").

Count

The count statement lets us explore parts of the data and see what the data looks like.

# Explore the data with some simple counts
# How many rows are there for each lake?
count(lakes.df, lake_name)

## # A tibble: 4 x 2
##   lake_name     n
##   <chr>     <int>
## 1 Grass       348
## 2 Indian      348
## 3 South       348
## 4 Willis      324

# How many rows are there for each genus_species?
count(lakes.df, genus_species)

## # A tibble: 12 x 2
##    genus_species               n
##    <chr>                   <int>
##  1 Aglaodiaptomus_leptpus    114
##  2 Bosmina_longirostris      114
##  3 Cyclops_scutifer          114
##  4 Daphnia_catawba           114
##  5 Daphnia_pulex             114
##  6 Diaphanosoma_birgei       114
##  7 Diaphanosoma_brachyurum   114
##  8 Epischura_lacustris       114
##  9 Holopedium_giberum        114
## 10 Leptodiaptomus_minutus    114
## 11 Mesocyclops_edax          114
## 12 Tropocyclops_extensus     114

# Odd: every genus_species has exactly the same number of rows.
# Look closer by counting only the rows where org_l is zero.
# The result stays grouped by genus_species because of group_by().
lakes.df %>%
  filter(org_l == 0) %>%
  group_by(genus_species) %>%
  count()

## # A tibble: 12 x 2
## # Groups:   genus_species [12]
##    genus_species               n
##    <chr>                   <int>
##  1 Aglaodiaptomus_leptpus     87
##  2 Bosmina_longirostris       21
##  3 Cyclops_scutifer          113
##  4 Daphnia_catawba            42
##  5 Daphnia_pulex              88
##  6 Diaphanosoma_birgei       113
##  7 Diaphanosoma_brachyurum    66
##  8 Epischura_lacustris       112
##  9 Holopedium_giberum         55
## 10 Leptodiaptomus_minutus      3
## 11 Mesocyclops_edax            5
## 12 Tropocyclops_extensus      65

# Many of the rows are zeros - count again with those rows removed
lakes.df %>%
  filter(org_l != 0) %>%
  count(genus_species)

## # A tibble: 12 x 2
##    genus_species               n
##    <chr>                   <int>
##  1 Aglaodiaptomus_leptpus     27
##  2 Bosmina_longirostris       93
##  3 Cyclops_scutifer            1
##  4 Daphnia_catawba            72
##  5 Daphnia_pulex              26
##  6 Diaphanosoma_birgei         1
##  7 Diaphanosoma_brachyurum    48
##  8 Epischura_lacustris         2
##  9 Holopedium_giberum         59
## 10 Leptodiaptomus_minutus    111
## 11 Mesocyclops_edax          109
## 12 Tropocyclops_extensus      49

Ifelse and flagging of data

You can use the ifelse command to do a lot of basic flagging and modification of data

# Flagging values with a condition
# if else ----
# ifelse(test, yes, no) picks a label row by row: "ZERO" when org_l is
# exactly 0, "NOT ZERO" when it is not (a missing org_l stays NA)

lakes.df <- lakes.df %>%
  mutate(
    flag = ifelse(org_l == 0, yes = "ZERO", no = "NOT ZERO")
  )

Case_when - if else on steroids

The case_when statement allows a lot more flexibility

# case when
# case_when() handles several conditions at once. Conditions are checked
# in order and each row takes the label of the first one that is TRUE.
# This overwrites the flag column created with ifelse() above.
lakes.df <- lakes.df %>%
  mutate(
    flag = case_when(
      org_l == 0 ~ "ZERO",
      # values above 0 and below 10 (this includes values below 1)
      org_l > 0 & org_l < 10 ~ "0 to 10",
      org_l >= 10 & org_l < 100 ~ "10 to 100",
      # catch-all: values of 100 or more, negative values, and NA
      TRUE ~ "something else"
    )
  )

Modifying dataframes using select and filter

Bill Perry

2019/10/26