[Tidyverse + MOSAIC]

http://mosaicdatabase.web.ox.ac.uk

14 March 2022

Vignette #2 - Tidyverse & MOSAIC

MOSAIC was built for use and integration with other datasets for comparative demography. Demographers are often interested in how traits scale across different taxonomic levels. The Tidyverse https://www.tidyverse.org/ collection of R packages (Dplyr in particular) are commonly used by ecologists to simplify workflows that subset and manipulate data. Here, we walk throught the basics of how Tidy tools can be use with the MOSAIC database to quickly isolate specific groups of organism or classes of records.

library(tidyverse) 

Accessing MOSAIC

Download MOSAIC from the mosaic portal. For more information on the basics of downloading MOSAIC and navigating the data structure, see: Vignette #1:

remotes::install_github("mosaicdatabase/Rmosaic")
library(Rmosaic)
mosaic <- mos_fetch("v1.0.0")

Using Tidyverse with MOSAIC

Tidyverse allows for the easily filtering of outliers and NAs, and the isolation of records to particular groups of taxa in MOSAIC. For example, one can isolate the differences in biomass between birds and mammals and see how the decomposition of variance in biomass between the groups informs the representation of biomass values in MOSAIC

# Bind the ID field, species, taxa, and biomass fields into a unified dataframe

biomass_table <- cbind(mosaic@mosaicID,
                       mosaic@species,
                       mosaic@taxaMetadat,
                       mosaic@biomass@value)

# Relabel the biomass field

names(biomass_table)[length(biomass_table)] <- "biomass"

# Coerce the field into a numeric values - transforms "NDY" into "NA"

biomass_table$biomass <- as.numeric(biomass_table$biomass)

# Removing an outlier and filtering out NAs

biomass_all <- biomass_table %>% 
  drop_na(biomass) %>%
  filter(biomass < 12000000)

# Filter to mammals

biomass_mamm <- biomass_table %>% 
  filter(Class == "Mammalia") %>%
  drop_na(biomass) # Remove results that are NA

# Filter to birds

biomass_aves <- biomass_table %>% 
  filter(Class == "Aves") %>%
  drop_na(biomass)

# Plot all biomass

plot(biomass_all$`mosaic@mosaicID`, log(biomass_all$biomass), pch=16,
     xlab = "MOSAIC ID index", ylab = "Log Biomass", main = "Biomass")

# Highlight points for mammals

points(biomass_mamm$`mosaic@mosaicID`, log(biomass_mamm$biomass), pch=16, col="red")

legend(2085, 16, legend=c("Mammals", "All Classes"),
       col=c("red", "black"), lty=1, cex=0.8)

# Generate histograms for all species on the same binning scale

Bin_all <- hist(log(biomass_all$biomass),
                breaks=seq(min(log(biomass_all$biomass)), max(log(biomass_all$biomass)),
                           length=15), main="Biomass (All Species)")

Bin_mamm <- hist(log(biomass_mamm$biomass),
                breaks=seq(min(log(biomass_all$biomass)), max(log(biomass_all$biomass)),
                           length=15), main = "Mammal Biomass")

Bin_aves <- hist(log(biomass_aves$biomass),
                 breaks=seq(min(log(biomass_all$biomass)), max(log(biomass_all$biomass)),
                            length=15), main = "Aves Biomass")

# Ploting frequency-density curves for species

plot(Bin_all$mids, Bin_all$counts, pch=16, type="l", col=alpha("black", 0.5), lwd=2, lty=2,
     ylim=c(0,12), ylab="record frequency", xlab="log biomass", main="Biomass by Class")
points(Bin_mamm$mids, Bin_mamm$counts, pch=16, type="l", col=alpha("red", 0.5), lwd=2)
points(Bin_aves$mids, Bin_aves$counts, pch=16, type="l", col=alpha("blue", 0.5), lwd=3)
legend(12, 12.5, legend=c("Mammals", "Birds", "All Classes"),
       col=c("red", "blue", "gray"), lty=1:2, cex=0.8)

# Note how the peak in records of large mammals is driven by mammals, whereas the high record count in smaller values is sustained on the aves records. Individually, the biomass distributions are semi-normal, whereas the joint distribution is more complex.

Alternatively, the same data could be plotted by group with boxplots, illustrating the same sotry.

bio_mamm.aves <- cbind(log(biomass_all$biomass), log(biomass_mamm$biomass), log(biomass_aves$biomass))
colnames(bio_mamm.aves) <- c("all", "mammals", "birds")
boxplot(bio_mamm.aves, ylab="log biomass", xlab="class", main="Biomass by Class")

Factor variables can similarly be parsed by class.

# Bind the ID field, species, taxa, and volancy fields into a unified dataframe

bio_vol <- cbind(mosaic@mosaicID,
                 mosaic@species,
                 mosaic@taxaMetadat,
                 mosaic@volancy@value)

# Relabel the volancy field

names(bio_vol)[length(bio_vol)] <- "volancy"

# Assign into a factor class

bio_vol$volancy <- as.factor(bio_vol$volancy)

# Filter out NDYs

vol_all <- bio_vol %>% 
  filter(!volancy == "NDY")

# Filter to Mammals

vol_mamm <- bio_vol %>% 
  filter(Class == "Mammalia") %>%
  filter(!volancy == "NDY")

# Filter to birds

vol_aves <- bio_vol %>% 
  filter(Class == "Aves") %>%
  filter(!volancy == "NDY")

# Look at the summary of factors to counts

summary(vol_mamm$volancy)
##         NDY  Non-volant Semi-volant      Volant 
##           0          37           1           0
summary(vol_aves$volancy)
##         NDY  Non-volant Semi-volant      Volant 
##           0           3           0          10
# Store summaries

vol_all_summary <- summary(vol_all$volancy)
vol_mamm_summary <- summary(vol_mamm$volancy)
vol_aves_summary <- summary(vol_aves$volancy)

# Plot the points

plot(vol_all_summary[2:4],  type="b", pch=16, col=alpha("black", 0.5),
     ylab="record count", xlab="Volancy (1 = Nonvolant; 2 = Semivolant; 3 = Volant)", main="Volancy by Class")
points(vol_mamm_summary[2:4], type="b", pch=16, col=alpha("red", 0.5))
points(vol_aves_summary[2:4], type="b", pch=16, col=alpha("blue", 0.5))
legend(2.5, 85, legend=c("Mammals", "Birds", "All Classes"),
       col=c("red", "blue", "gray"), lty=1:2, cex=0.8)

If one is more generally interested in the class breakdown of factors across a trait field without a priori interest in a specific variable, one can take advantage of the tools.

# Evaluating the variance breakdown within a given class

Volant_sum <- bio_vol %>% 
  filter(volancy=="Volant")%>%
  group_by(Class, volancy)%>%
  count()

Semivolant_sum <- bio_vol %>% 
  filter(volancy=="Semi-volant")%>%
  group_by(Class, volancy)%>%
  count()

Nonvolant_sum <- bio_vol %>% 
  filter(volancy=="Non-volant")%>%
  group_by(Class, volancy)%>%
  count()

volTable <- Nonvolant_sum # Generating DF

volTable <- volTable[,-2]
colnames(volTable)[2] <- "Nonvolant"
volTable[,3] <- rep(0, length(Nonvolant_sum$Class))
volTable[,4] <- rep(0, length(Nonvolant_sum$Class))
colnames(volTable)[3] <- "Volant"
colnames(volTable)[4] <- "Semivolant"
volTable[match(Volant_sum$Class, Nonvolant_sum$Class),3] <- Volant_sum$n
volTable[match(Semivolant_sum$Class, Nonvolant_sum$Class),4] <- Semivolant_sum$n

volTable
## # A tibble: 13 × 4
## # Groups:   Class [13]
##    Class          Nonvolant Volant Semivolant
##    <chr>              <int>  <dbl>      <dbl>
##  1 Actinopterygii         9      0          0
##  2 Amphibia               1      0          0
##  3 Anthozoa               5      0          0
##  4 Aves                   3     10          0
##  5 Bivalvia               3      0          0
##  6 Branchiopoda           2      0          0
##  7 Demospongiae           1      0          0
##  8 Elasmobranchii         1      0          0
##  9 Gastropoda             3      0          0
## 10 Insecta                2      0          0
## 11 Mammalia              37      0          1
## 12 Reptilia               8      0          0
## 13 <NA>                   7      0          0