minor edits post-workshop; edits to create pdf output (#16)
* edits following Mt Annan PlantBank workshop
* reformatting to build pdf 
* additional edits to various sections of book that were unfinished
---------

Co-authored-by: Daniel Falster <[email protected]>
ehwenk and dfalster authored Dec 14, 2023
1 parent 7b5518c commit 6ba0352
Showing 29 changed files with 108,471 additions and 252 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -5,8 +5,12 @@
.Ruserdata
_book
_freeze
*_files
figures/
tmp

*.html
/.quarto/
data/austraits/austraits-5.0.0.rds
data/austraits/austraits.json
site_libs
index.tex
132 changes: 83 additions & 49 deletions AusTraits_tutorial.qmd

Large diffs are not rendered by default.

17 changes: 13 additions & 4 deletions _quarto.yml
@@ -2,9 +2,9 @@ project:
type: book

book:
title: "The {traits.build} data standard, R package, and workflow"
author: ["Elizabeth Wenk", "Daniel Falster", "Sophie Yang", "Fonti Kar"]
page-footer: "How to get help: <https://traitecoevo.github.io/traits.build-book/help.html><br>Copyright 2023, Daniel Falster and Elizabeth Wenk"
title: "The {traits.build} data model, R package, and workflow"
author: "Elizabeth Wenk, Sophie Yang, Fonti Kar, Daniel Falster\n\nEvolution and Ecology Research Centre, \n\nThe University of New South Wales Sydney, NSW 2052\n\nAustralia"
page-footer: "How to get help: <https://traitecoevo.github.io/traits.build-book/help.html><br>Copyright 2023, Daniel Falster and Elizabeth Wenk"
page-navigation: true
chapters:
- part: Introduction
@@ -53,7 +53,6 @@ book:
- part: Getting help
chapters:
- help.qmd
- debugging.qmd

appendices:
- csv.qmd
@@ -63,3 +62,13 @@ format:
html:
theme: cosmo
number-depth: 2
pdf:
toc: true
number-sections: true
colorlinks: true
papersize: A4
geometry:
- top=20mm
- left=20mm
- heightrounded

52 changes: 33 additions & 19 deletions adding_data_long.qmd
@@ -17,8 +17,6 @@ knitr::opts_chunk$set(
)
library(traits.build)
my_kable_styling <- util_kable_styling_html
```

```{r, echo=FALSE, results='hide', message=FALSE}
@@ -292,7 +290,8 @@ Double check the information added to ensure:
By default, details are added as the primary source. If multiple sources are linked to a single `dataset_id`, you can specify a source as `secondary`.

```{r, eval=FALSE, echo=TRUE}
traits.build::metadata_add_source_doi(dataset_id = current_study, doi = "doi", type = "secondary")
traits.build::metadata_add_source_doi(dataset_id = current_study, doi = "doi",
type = "secondary")
```

- Attempting to add a second primary source will overwrite the information already input. Instead, if there is a third resource to add, use `type = "secondary_2"`.
@@ -401,7 +400,8 @@ To allow custom modifications to a particular dataset before the common pipeline

```{r, eval=FALSE, echo=TRUE}
data <-
readr::read_csv(filename_data_raw, col_types = cols(), guess_max = 100000, progress = FALSE) %>%
readr::read_csv(filename_data_raw, col_types = cols(), guess_max = 100000,
progress = FALSE) %>%
process_custom_code(metadata[["dataset"]][["custom_R_code"]])()
```
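As a rough sketch of what happens at this step (the name `apply_custom_code` and the sample snippet below are invented for illustration; the real `process_custom_code()` in traits.build may differ in detail), the stored string of R code is parsed and evaluated with `data` in scope, returning the modified table:

```{r, eval=FALSE, echo=TRUE}
# Illustrative only: `apply_custom_code` is a made-up name, not a traits.build function
apply_custom_code <- function(code_string) {
  function(data) {
    # No custom code recorded: pass the data through unchanged
    if (is.null(code_string) || is.na(code_string)) return(data)
    # Evaluate the stored snippet with `data` bound in a scratch environment
    eval(parse(text = code_string), envir = list2env(list(data = data)))
  }
}

df <- data.frame(leaf_length = c(10, 25))  # invented measurements, in mm
snippet <- "transform(data, leaf_length = leaf_length / 10)"  # convert mm to cm
apply_custom_code(snippet)(df)
```

Because the snippet is stored as plain text in the metadata file, it travels with the dataset and is re-applied on every rebuild.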

@@ -526,8 +526,10 @@ If the dataset has location information, but lacks unique location names (or any
```{r, eval=FALSE, echo=TRUE}
data %>%
dplyr::mutate(
location_name = ifelse(location_name == "Mt Field" & habitat == "Montane rainforest", "Mt Field_wet", location_name),
location_name = ifelse(location_name == "Mt Field" & habitat == "Dry sclerophyll", "Mt Field_dry", location_name)
location_name = ifelse(location_name == "Mt Field" & habitat == "Montane rainforest",
"Mt Field_wet", location_name),
location_name = ifelse(location_name == "Mt Field" & habitat == "Dry sclerophyll",
"Mt Field_dry", location_name)
)
```

@@ -542,7 +544,8 @@ data %>%
longitude == 151.2917 ~ "diatreme"
)
)
# Note with `dplyr::case_when`, any rows that do not match any of the conditions become `NA`'s.
# Note with `dplyr::case_when`,
# any rows that do not match any of the conditions become `NA`'s.
```
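A minimal, self-contained illustration of that behaviour (the coordinates below are invented for the example): with dplyr >= 1.1.0 a `.default` argument supplies a fallback value, while on older versions a final `TRUE ~ ...` clause does the same job.

```{r, eval=FALSE, echo=TRUE}
library(dplyr)

locations <- tibble(longitude = c(152.1234, 151.2917, 150.0000))

locations <- locations %>%
  mutate(
    description = case_when(
      longitude == 152.1234 ~ "lowland rainforest",
      longitude == 151.2917 ~ "diatreme",
      # Without this fallback, the unmatched third row would become NA
      .default = "unclassified"
    )
  )
```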

or
@@ -615,8 +618,10 @@ A particularly complicated example where some dates are presented as `yyyy-mm` a
```{r, eval=FALSE, echo=TRUE}
data %>%
dplyr::mutate(
weird_date = ifelse(stringr::str_detect(gathering_date, "^[0-9]{4}"), gathering_date, NA),
gathering_date = gathering_date %>% lubridate::mdy(quiet = T) %>% as.character(),
weird_date = ifelse(stringr::str_detect(gathering_date, "^[0-9]{4}"),
gathering_date, NA),
gathering_date = gathering_date %>%
lubridate::mdy(quiet = T) %>% as.character(),
gathering_date = coalesce(gathering_date, weird_date)
) %>%
select(-weird_date)
@@ -712,7 +717,9 @@ Some examples of syntax to add `locations` data that exists in different formats
locations <-
check_custom_R_code(current_study) %>%
dplyr::distinct(location_name, latitude, longitude, `veg type`) %>%
dplyr::rename(dplyr::all_of(c("latitude (deg)" = "latitude", "longitude (deg)" = "longitude", "description" = "veg type")))
dplyr::rename(dplyr::all_of(c("latitude (deg)" = "latitude",
"longitude (deg)" = "longitude",
"description" = "veg type")))
traits.build::metadata_add_locations(current_study, locations)
```
@@ -918,9 +925,11 @@ The trait details then need to be filled in manually.
**NOTE**:
- If identical methods apply to a series of traits, use the following syntax for the first trait, where the `&leaf_length_method` notation assigns the remaining text in the field to the anchor `leaf_length_method`.

```
methods: &leaf_length_method All measurements were from dry herbarium collections, with leaf and bracteole measurements taken from the largest of these structures on each specimen.
```
```
methods: &leaf_length_method All measurements were from dry herbarium
collections, with leaf and bracteole measurements taken from the largest
of these structures on each specimen.
```
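For each later trait measured with the same protocol, the anchored text can then be pulled in with a YAML alias (`*leaf_length_method`). A sketch of the resulting metadata fragment — the trait and column names here are illustrative, not from a real dataset:

```
traits:
- var_in: leaf_length_mm
  trait_name: leaf_length
  methods: &leaf_length_method All measurements were from dry herbarium
    collections, with leaf and bracteole measurements taken from the largest
    of these structures on each specimen.
- var_in: bracteole_length_mm
  trait_name: bracteole_length
  methods: *leaf_length_method
```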

Then, for the next trait that uses this method, you can simply include `methods: *leaf_length_method`. At the end of processing you can read/write the yml file and this will fill in the assigned text throughout.

@@ -1020,7 +1029,8 @@ metadata_add_substitutions_list(dataset_id, substitutions_to_add)
taxonomic_updates:
- find: Acacia ancistrophylla/sclerophylla
replace: Acacia sp. [Acacia ancistrophylla/sclerophylla; White_2020]
reason: Rewording taxon where `/` indicates uncertain species identification to align with `APC accepted` genus (2022-11-10)
reason: Rewording taxon where `/` indicates uncertain species identification
to align with `APC accepted` genus (2022-11-10)
taxonomic_resolution: genus
- find: Pimelea neo-anglica
replace: Pimelea neoanglica
@@ -1036,7 +1046,8 @@ taxonomic_updates:
taxonomic_resolution: genus
- find: Polyalthia (Wyvur)
replace: Polyalthia sp. (Wyvuri B.P.Hyland RFK2632)
reason: Fuzzy match alignment with species-level canonical name in `APC known` when everything except first 2 words ignored (2022-11-10)
reason: Fuzzy match alignment with species-level canonical name in `APC known`
when everything except first 2 words ignored (2022-11-10)
taxonomic_resolution: Species
```

Expand All @@ -1054,7 +1065,9 @@ The four elements it includes are:
The function `metadata_add_taxonomic_change` adds single taxonomic updates directly into `metadata[["taxonomic_updates"]]`:

```{r, eval=FALSE, echo=TRUE}
traits.build::metadata_add_taxonomic_change(current_study, "find", "replace", "reason", "taxonomic_resolution")
traits.build::metadata_add_taxonomic_change(current_study,
"find", "replace", "reason",
"taxonomic_resolution")
```

The function `metadata_add_taxonomic_changes_list` adds a table of taxonomic updates directly into `metadata[["taxonomic_updates"]]`. The column headers must be `find`, `replace`, `reason`, and `taxonomic_resolution`.
@@ -1172,8 +1185,8 @@ Fix as many errors as you can and then rerun `dataset_test()` repeatedly until n

You may want to fix errors in tandem with [building the new dataset](#build_new_dataset), such as to be able to quickly compile a list of trait values requiring [substitutions](#add_substitutions) or taxon names requiring [taxonomic updates](#add_taxonomic_updates).

See the [common issues](#data_common_issues.html) chapter for solutions to common issues, such as:
- dataset not [pivoting](#data_common_issues.html#cannot_pivot)
See the [common issues](data_common_issues.html) chapter for solutions to common issues, such as:
- dataset not [pivoting](data_common_issues.html#cannot_pivot)
- unsupported trait values

### Rebuild AusTraits {#build_new_dataset}
@@ -1310,7 +1323,8 @@ At the very end, re-clear formatting, re-run tests, rebuild AusTraits, rebuild r
To generate a report for a collection of studies:

```{r, eval=FALSE, echo=TRUE}
traits.build::dataset_reports(database, c("Falster_2005_1", "Wright_2002"), overwrite = TRUE)
traits.build::dataset_reports(database, c("Falster_2005_1", "Wright_2002"),
overwrite = TRUE)
```

Or for all studies:
2 changes: 1 addition & 1 deletion austraits_package.qmd
@@ -5,7 +5,7 @@ knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
comment = "#>",
fig.path = "tmp/figures/"
fig.path = "figures/"
)
library(dplyr)
42 changes: 29 additions & 13 deletions check_dataset_functions.qmd
@@ -48,7 +48,9 @@ dataset_check_numeric_values <- function(database, dataset) {
dataset_id == dataset,
error == "Value out of allowable range"
) %>%
dplyr::select(dplyr::all_of(c("dataset_id", "trait_name", "value", "observation_id", "unit", "original_name")))
dplyr::select(
dplyr::all_of(c("dataset_id", "trait_name", "value",
"observation_id", "unit", "original_name")))
out_of_range_values
@@ -68,7 +70,8 @@ dataset_check_taxonomic_updates <- function(taxon_list, database, dataset) {
database$taxonomic_updates %>%
dplyr::filter(dataset_id == dataset) %>%
dplyr::filter(!aligned_name %in% taxon_list$aligned_name & !aligned_name %in% taxon_list$taxon_name) %>%
dplyr::filter(!aligned_name %in% taxon_list$aligned_name,
!aligned_name %in% taxon_list$taxon_name) %>%
dplyr::filter(is.na(taxonomic_resolution)) %>%
dplyr::distinct(original_name)
@@ -82,7 +85,7 @@ dataset_check_taxonomic_updates <- function(taxon_list, database, dataset) {

**output**: Table of trait measurements that are preventing the dataset from pivoting

One of the automated tests in the function `dataset_test()` confirms the dataset can pivot from `longer` to `wider`. This tests is confirming that each row in the traits table has a unique combination of a particular 7 columns: `dataset_id`, `trait_name`, `observation_id`, `value_type`, `repeat_measurements_id`, `method_id`, `method_context_id`. When a dataset does not pivot wider, it is generally because `observation_id` has not been correctly parsed during the dataset processing. `observation_id` is an integral counter within a dataset that represents unique combinations of `taxon_name`, `population_id`, `individual_id`, `temporal_context_id`, `entity_type`, `life_stage`, `source_id`, `entity_context_id`, `basis_of_record`, `collection_date`, `original_name`. If any of these 11 pieces of metadata are incorrectly encoded in the metadata file, two distinct `observations` of traits might be assigned identical `observation_id`'s. The most likely culprits are forgetting to map in a context property or a column with `source_id`'s.
One of the automated tests in the function `dataset_test()` confirms the dataset can pivot from `longer` to `wider`. This test confirms that each row in the traits table has a unique combination of values across 7 particular columns: `dataset_id`, `trait_name`, `observation_id`, `value_type`, `repeat_measurements_id`, `method_id`, `method_context_id`. When a dataset does not pivot wider, it is generally because `observation_id` has not been correctly parsed during the dataset processing. `observation_id` is an integral counter within a dataset that represents unique combinations of `taxon_name`, `population_id`, `individual_id`, `temporal_context_id`, `entity_type`, `life_stage`, `source_id`, `entity_context_id`, `basis_of_record`, `collection_date`, `original_name`. If any of these 11 pieces of metadata are incorrectly encoded in the metadata file, two distinct `observations` of traits might be assigned identical `observation_id`'s. The most likely culprits are forgetting to map in a context property or a column with `source_id`'s.

Overall, there are 17 separate columns that could be causing the `db_traits_pivot_wider()` test from `dataset_test()` to fail, making it difficult to discern where to troubleshoot. This function outputs a list of the trait measurements that are causing the pivot test to fail, allowing you to hone in on the source of the problem.
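The core of the pivot check can be reproduced on a toy table: pivoting wider with `values_fn = length` counts the rows behind each cell, so any count above 1 marks a duplicated identifying combination. The data below are invented, and only `observation_id` and `trait_name` are used as identifiers for brevity:

```{r, eval=FALSE, echo=TRUE}
library(dplyr)
library(tidyr)

# Rows 1 and 2 collide: same observation_id and trait_name
traits <- tibble(
  observation_id = c("01", "01", "02"),
  trait_name     = c("leaf_area", "leaf_area", "leaf_area"),
  value          = c("10.2", "11.5", "8.4")
)

counts <- traits %>%
  pivot_wider(names_from = "trait_name", values_from = "value",
              values_fn = length)

# Observation "01" has 2 leaf_area values, so this dataset would fail to pivot
counts %>% filter(leaf_area > 1)
```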

@@ -96,12 +99,19 @@ dataset_check_not_pivoting <- function(database, dataset) {
dplyr::select(
# `taxon_name` and `original_name` are not needed for pivoting but are included for informative purposes
dplyr::all_of(
c("dataset_id", "trait_name", "value", "taxon_name", "original_name", "observation_id",
"value_type", "repeat_measurements_id", "method_id", "method_context_id"))
c("dataset_id", "trait_name", "value", "taxon_name", "original_name",
"observation_id", "value_type", "repeat_measurements_id", "method_id",
"method_context_id"))
) %>%
tidyr::pivot_wider(
names_from = "trait_name",
values_from = "value",
values_fn = length
) %>%
tidyr::pivot_wider(names_from = "trait_name", values_from = "value", values_fn = length) %>%
tidyr::pivot_longer(cols = 9:ncol(.)) %>%
dplyr::rename(dplyr::all_of(c("trait_name" = "name", "number_of_duplicates" = "value"))) %>%
dplyr::rename(dplyr::all_of(
c("trait_name" = "name", "number_of_duplicates" = "value")
)) %>%
dplyr::select(
dplyr::all_of(c("dataset_id", "trait_name", "number_of_duplicates",
"taxon_name", "original_name", "observation_id", "value_type")), everything()
@@ -155,11 +165,13 @@ dataset_check_outlier_by_species <- function(database, dataset, trait, multiplie
need_review <- to_compare %>%
dplyr::filter(trait_name == trait) %>%
dplyr::filter(taxon_name %in% comparisons$taxon_name) %>%
dplyr::select(dplyr::all_of(c("taxon_name", "trait_name", "value", "observation_id", "unit", "original_name"))) %>%
dplyr::select(dplyr::all_of(c("taxon_name", "trait_name", "value",
"observation_id", "unit", "original_name"))) %>%
dplyr::left_join(
by = c("taxon_name", "trait_name"),
comparisons) %>%
dplyr::filter(as.numeric(value) > multiplier*mean_value | as.numeric(value) < (1/multiplier)*mean_value) %>%
dplyr::filter(as.numeric(value) > multiplier*mean_value |
as.numeric(value) < (1/multiplier)*mean_value) %>%
dplyr::mutate(value_ratio = as.numeric(value)/mean_value) %>%
dplyr::arrange(dplyr::desc(value_ratio))
@@ -180,12 +192,14 @@ dataset_check_outlier_by_genus <- function(database, dataset, trait, multiplier)
to_compare <-
database$traits %>%
dplyr::filter(dataset_id == dataset) %>%
dplyr::left_join(by = c("taxon_name"), database$taxa %>% dplyr::select(dplyr::all_of(c("taxon_name", "genus"))))
dplyr::left_join(by = c("taxon_name"),
database$taxa %>% dplyr::select(dplyr::all_of(c("taxon_name", "genus"))))
comparisons <- database$traits %>%
dplyr::filter(trait_name == trait) %>%
dplyr::filter(dataset_id != dataset) %>%
dplyr::left_join(by = c("taxon_name"), database$taxa %>% dplyr::select(dplyr::all_of(c("taxon_name", "genus")))) %>%
dplyr::left_join(by = c("taxon_name"),
database$taxa %>% dplyr::select(dplyr::all_of(c("taxon_name", "genus")))) %>%
dplyr::filter(genus %in% to_compare$genus) %>%
dplyr::select(dplyr::all_of(c("genus", "trait_name", "value"))) %>%
dplyr::group_by(genus) %>%
@@ -207,11 +221,13 @@ dataset_check_outlier_by_genus <- function(database, dataset, trait, multiplier)
need_review <- to_compare %>%
dplyr::filter(trait_name == trait) %>%
dplyr::filter(genus %in% comparisons$genus) %>%
dplyr::select(dplyr::all_of(c("taxon_name", "trait_name", "value", "genus", "observation_id", "unit", "original_name"))) %>%
dplyr::select(dplyr::all_of(c("taxon_name", "trait_name", "value", "genus",
"observation_id", "unit", "original_name"))) %>%
dplyr::left_join(
by = c("genus", "trait_name"),
comparisons) %>%
dplyr::filter(as.numeric(value) > multiplier*mean_value | as.numeric(value) < (1/multiplier)*mean_value) %>%
dplyr::filter(as.numeric(value) > multiplier*mean_value |
as.numeric(value) < (1/multiplier)*mean_value) %>%
dplyr::mutate(value_ratio = as.numeric(value)/mean_value) %>%
dplyr::arrange(dplyr::desc(value_ratio))
