Load libraries

# Load Libraries ----
# this is done each time you run a script
library("readxl") # read in excel files
library("tidyverse") # dplyr and piping and ggplot etc
library("lubridate") # dates and times
library("scales") # scales on ggplot ases
library("skimr") # quick summary stats
library("janitor") # clean up excel imports
library("patchwork") # multipanel graphs

Read in files

# So now we have seen how to look at the data
# What if we wanted to modify the data in terms of overall structure

# If you were typing in data this might be how it looks
# Read in wide dataframe ----
lakes.df <- read_csv("data/reduced_lake_long_genus_species_wide.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   permanent_id = col_double(),
##   lake_name = col_character(),
##   date = col_date(format = ""),
##   year = col_double(),
##   Cladoceran_Bosmina_longirostris = col_double(),
##   Cladoceran_Daphnia_catawba = col_double(),
##   Cladoceran_Daphnia_pulex = col_double(),
##   Cladoceran_Diaphanosoma_birgei = col_double(),
##   Cladoceran_Diaphanosoma_brachyurum = col_double(),
##   Cladoceran_Holopedium_giberum = col_double(),
##   Copepod_Aglaodiaptomus_leptpus = col_double(),
##   Copepod_Cyclops_scutifer = col_double(),
##   Copepod_Epischura_lacustris = col_double(),
##   Copepod_Leptodiaptomus_minutus = col_double(),
##   Copepod_Mesocyclops_edax = col_double(),
##   Copepod_Tropocyclops_extensus = col_double()
## )
head(lakes.df)
## # A tibble: 6 x 16
##   permanent_id lake_name date        year Cladoceran_Bosmina… Cladoceran_Daphni…
##          <dbl> <chr>     <date>     <dbl>               <dbl>              <dbl>
## 1     47723283 Willis    1994-06-28  1994                   0            0.00403
## 2     47723283 Willis    1994-07-27  1994                   0            0      
## 3     47723283 Willis    1994-08-31  1994                   0            0.0356 
## 4     47723283 Willis    1995-08-09  1995                   0            0      
## 5     47723283 Willis    1995-09-12  1995                   0            0      
## 6     47723283 Willis    1996-06-17  1996                   0            0      
## # … with 10 more variables: Cladoceran_Daphnia_pulex <dbl>,
## #   Cladoceran_Diaphanosoma_birgei <dbl>,
## #   Cladoceran_Diaphanosoma_brachyurum <dbl>,
## #   Cladoceran_Holopedium_giberum <dbl>, Copepod_Aglaodiaptomus_leptpus <dbl>,
## #   Copepod_Cyclops_scutifer <dbl>, Copepod_Epischura_lacustris <dbl>,
## #   Copepod_Leptodiaptomus_minutus <dbl>, Copepod_Mesocyclops_edax <dbl>,
## #   Copepod_Tropocyclops_extensus <dbl>

Gather to long format

How to convert a wide dataframe to a long data frame. This is the older approach which has since been replaced by pivot_longer. Gather is still something I use a lot.

# Convert wide to long format ----
# this is an older method that is simple
lakes_long.df <- lakes.df %>% 
  gather(genus_species, # this will make a new column group with the column names 
         org_l,  # this will make a column of data for the counts per liter
         -lake_name, -date, -permanent_id, -year # "-" means leave alone
  )

Pivot_longer to long format

This is the same as above and does the same thing.

# this is the newer way that might be better
lakes_long.df <- lakes.df %>%
  pivot_longer(
    -c(lake_name, date, permanent_id, year),
    names_to = "genus_species", 
    values_to = "org_l")

Spread

This spreads a long dataframe into a wide fomat dataframe.

# Long to Wide format ----
# the older method 
lakes_wide.df <- lakes_long.df %>%
  spread(
    genus_species, org_l
  )

Pivot_wider

This is the same as spread using a slightly newer terminology

# now the newer method
lakes_wide.df <- lakes_long.df %>%
  pivot_wider(
    id_cols = c( "lake_name", "date" ),
    names_from = genus_species,
    values_from = org_l)

Read in long format file

Lets read in a new version of the file and do some more work

# now the new version of modification can also do cool stuff like 
# summarizing data 
# lets read in a new file to add some complexity for fun
lakes.df <- read_csv("data/reduced_lake_long_genus_species.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   permanent_id = col_double(),
##   lake_name = col_character(),
##   date = col_date(format = ""),
##   group = col_character(),
##   genus_species = col_character(),
##   org_l = col_double(),
##   year = col_double()
## )

Here we remove the genus species column in preparation to summarize the data

# lets simplify the data
lakes.df <- lakes.df %>% select(-genus_species)

Pivot to summarize data

We can also use the pivot_wider or pivot_longer to summarize data

# make wide with column headers of group and summarize by lake and date
group_wide.df <- lakes.df %>%
  pivot_wider(names_from = group, 
              values_from = org_l,
              values_fn = list(org_l = mean, na.rm=TRUE))