# Load Libraries ----
# this is done each time you run a script
library("readxl") # read in excel files
library("tidyverse") # dplyr and piping and ggplot etc
library("lubridate") # dates and times
library("scales") # scales on ggplot ases
library("skimr") # quick summary stats
library("janitor") # clean up excel imports
library("patchwork") # multipanel graphs
# So now we have seen how to look at the data
# What if we wanted to modify the data in terms of overall structure
# If you were typing in data this might be how it looks
# Read in wide dataframe ----
lakes.df <- read_csv("data/reduced_lake_long_genus_species_wide.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## permanent_id = col_double(),
## lake_name = col_character(),
## date = col_date(format = ""),
## year = col_double(),
## Cladoceran_Bosmina_longirostris = col_double(),
## Cladoceran_Daphnia_catawba = col_double(),
## Cladoceran_Daphnia_pulex = col_double(),
## Cladoceran_Diaphanosoma_birgei = col_double(),
## Cladoceran_Diaphanosoma_brachyurum = col_double(),
## Cladoceran_Holopedium_giberum = col_double(),
## Copepod_Aglaodiaptomus_leptpus = col_double(),
## Copepod_Cyclops_scutifer = col_double(),
## Copepod_Epischura_lacustris = col_double(),
## Copepod_Leptodiaptomus_minutus = col_double(),
## Copepod_Mesocyclops_edax = col_double(),
## Copepod_Tropocyclops_extensus = col_double()
## )
head(lakes.df)
## # A tibble: 6 x 16
## permanent_id lake_name date year Cladoceran_Bosmina… Cladoceran_Daphni…
## <dbl> <chr> <date> <dbl> <dbl> <dbl>
## 1 47723283 Willis 1994-06-28 1994 0 0.00403
## 2 47723283 Willis 1994-07-27 1994 0 0
## 3 47723283 Willis 1994-08-31 1994 0 0.0356
## 4 47723283 Willis 1995-08-09 1995 0 0
## 5 47723283 Willis 1995-09-12 1995 0 0
## 6 47723283 Willis 1996-06-17 1996 0 0
## # … with 10 more variables: Cladoceran_Daphnia_pulex <dbl>,
## # Cladoceran_Diaphanosoma_birgei <dbl>,
## # Cladoceran_Diaphanosoma_brachyurum <dbl>,
## # Cladoceran_Holopedium_giberum <dbl>, Copepod_Aglaodiaptomus_leptpus <dbl>,
## # Copepod_Cyclops_scutifer <dbl>, Copepod_Epischura_lacustris <dbl>,
## # Copepod_Leptodiaptomus_minutus <dbl>, Copepod_Mesocyclops_edax <dbl>,
## # Copepod_Tropocyclops_extensus <dbl>
How to convert a wide dataframe to a long data frame. This is the older approach which has since been replaced by pivot_longer. Gather is still something I use a lot.
# Convert wide to long format ----
# this is an older method that is simple
lakes_long.df <- lakes.df %>%
gather(genus_species, # this will make a new column group with the column names
org_l, # this will make a column of data for the counts per liter
-lake_name, -date, -permanent_id, -year # "-" means leave alone
)
This is the same as above and does the same thing.
# this is the newer way that might be better
lakes_long.df <- lakes.df %>%
pivot_longer(
-c(lake_name, date, permanent_id, year),
names_to = "genus_species",
values_to = "org_l")
This spreads a long dataframe into a wide fomat dataframe.
# Long to Wide format ----
# the older method
lakes_wide.df <- lakes_long.df %>%
spread(
genus_species, org_l
)
This is the same as spread using a slightly newer terminology
# now the newer method
lakes_wide.df <- lakes_long.df %>%
pivot_wider(
id_cols = c( "lake_name", "date" ),
names_from = genus_species,
values_from = org_l)
Lets read in a new version of the file and do some more work
# now the new version of modification can also do cool stuff like
# summarizing data
# lets read in a new file to add some complexity for fun
lakes.df <- read_csv("data/reduced_lake_long_genus_species.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## permanent_id = col_double(),
## lake_name = col_character(),
## date = col_date(format = ""),
## group = col_character(),
## genus_species = col_character(),
## org_l = col_double(),
## year = col_double()
## )
Here we remove the genus species column in preparation to summarize the data
# lets simplify the data
lakes.df <- lakes.df %>% select(-genus_species)
We can also use the pivot_wider or pivot_longer to summarize data
# make wide with column headers of group and summarize by lake and date
group_wide.df <- lakes.df %>%
pivot_wider(names_from = group,
values_from = org_l,
values_fn = list(org_l = mean, na.rm=TRUE))