This notebook looks for common sources of error and flags those records for manual revision by the assessors who captured the data from each country.
The output is a file showing the records that need manual review or corrections, if any.
Load required libraries:
library(tidyr)
library(dplyr)
library(utile.tools)
library(stringr)
library(ggplot2)
Get Kobo raw output data:
# Read the raw Kobo export (semicolon-separated, with a header row) and add a
# single `taxon` label built from the genus / species / subspecies_variety
# fields.
kobo_output <- read.csv(
  file = "International_Genetic_Indicator_testing_V_4.0_-_latest_version_-_False_-_2023-11-02-08-23-26.csv",
  sep = ";",
  header = TRUE
) %>%
  mutate(
    # The name is pasted with utile.tools::paste(na.rm = TRUE); white space
    # left at the end of the pasted name is then trimmed off.
    taxon = str_trim(
      utile.tools::paste(genus, species, subspecies_variety, na.rm = TRUE),
      side = "right"
    )
  )
Filter out records which were marked as “not_approved” in the manual Kobo validation interface (this means country assessors determined there is something wrong with that particular record).
# List the records flagged as "validation_status_not_approved" so they can be
# inspected before they are removed.
kobo_output %>%
  filter(X_validation_status %in% "validation_status_not_approved") %>%
  select(country_assessment, name_assessor, taxon)

# Omit those records from the data.
# `%in%` is used instead of `!=` because filter() drops every row whose
# condition evaluates to NA: with `!=`, records whose validation status is
# missing (NA) would be silently discarded together with the rejected ones.
kobo_output <- kobo_output %>%
  filter(!X_validation_status %in% "validation_status_not_approved")
# Columns where a leftover "test" or "Template" entry is likely to appear
cols <- c(
  "name_assessor", "email_assessor", "genus", "species", "subspecies_variety",
  "scientific_authority", "common_name", "GBIF_taxonID", "NCBI_taxonID",
  "time_populations"
)

# Show the records that contain "test" or "Template" in any of those columns.
# NOTE(review): grepl() matches substrings, so a legitimate value that merely
# contains "test" is listed here too and will be removed below; review this
# listing before trusting the filtered data.
kobo_output %>%
  filter(
    if_any(all_of(cols), ~ grepl("test", .)) |
      if_any(all_of(cols), ~ grepl("Template", .))
  ) %>%
  select(country_assessment, name_assessor, genus, species)

# Filter them out of the dataset.
# The negation has to wrap if_any(): a row is dropped when ANY of the columns
# contains "test". Negating inside the lambda instead (~ !grepl(...)) keeps
# every row that has at least one column without "test", i.e. practically
# all of them, so the test records would stay in the data.
kobo_output <- kobo_output %>%
  filter(!if_any(all_of(cols), ~ grepl("test", .))) %>%
  filter(genus != "Template")
In the form, -999 was used to mark taxa with unknown number of extant populations. This was used because answering the question was mandatory, so leaving it blank wasn’t possible. We have to change -999 to NA.
# -999 is the form's placeholder for an unknown number of extant populations;
# turn it into a proper missing value.
kobo_output <- mutate(
  kobo_output,
  n_extant_populations = na_if(n_extant_populations, -999)
)
We can now explore how many populations per species are still extant (i.e. still existing, NOT extinct):
# Numeric summary of the number of extant populations per record
summary(kobo_output[["n_extant_populations"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 1.00 3.00 63.28 8.00 47364.00 19
# How many records report each distinct number of extant populations
table(kobo_output[["n_extant_populations"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 4 303 134 86 76 41 36 15 23 14 21 13 9
## 13 14 15 16 17 18 19 20 21 22 23 24 25
## 13 6 15 7 2 11 11 4 3 3 2 5 2
## 26 27 28 30 33 34 35 36 37 38 40 42 43
## 2 1 2 9 2 1 3 3 3 2 1 2 1
## 45 47 48 50 53 54 55 57 60 65 68 69 72
## 3 1 1 1 1 1 2 1 1 3 1 1 1
## 74 75 77 79 80 85 87 89 91 95 100 104 105
## 1 1 2 1 1 1 1 1 1 1 1 1 2
## 113 116 122 124 127 137 146 148 150 173 176 180 186
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 200 241 242 284 286 311 330 345 347 840 47364
## 1 1 1 1 1 1 1 1 1 1 1
Plot histogram
# Histogram of the number of extant populations over all records
ggplot(data = kobo_output) +
  geom_histogram(mapping = aes(x = n_extant_populations))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (`stat_bin()`).
Zoom in on the histogram (records with fewer than 25 extant populations)
# Same histogram, restricted to records with at least 0 and fewer than 25
# extant populations
kobo_output %>%
  filter(n_extant_populations >= 0 & n_extant_populations < 25) %>%
  ggplot(mapping = aes(x = n_extant_populations)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Once -999 has been replaced by NA there should be no negative numbers of populations (if there are any, they are typos that need to be corrected).
# Records reporting a negative number of extant populations
kobo_output %>%
  select(
    country_assessment, taxon, name_assessor,
    n_extant_populations, n_extint_populations
  ) %>%
  filter(n_extant_populations < 0)
Show which taxa (if any) have 0 (zero) extant populations. Is this correct? needs to be manually checked
# Records reporting exactly zero extant populations
kobo_output %>%
  select(
    country_assessment, taxon, name_assessor,
    n_extant_populations, n_extint_populations
  ) %>%
  filter(n_extant_populations == 0)
Show which species (if any) have 999 extant populations. Should this be -999? OR n_extinct pops??
# Records reporting 999 extant populations (possibly a mistyped -999)
kobo_output %>%
  select(
    country_assessment, taxon, name_assessor,
    n_extant_populations, n_extint_populations
  ) %>%
  filter(n_extant_populations == 999)
Show which species (if any) have 999 EXTINCT populations. Should this be -999?
# Records reporting 999 extinct populations (possibly a mistyped -999)
kobo_output %>%
  select(
    country_assessment, taxon, name_assessor,
    n_extant_populations, n_extint_populations
  ) %>%
  filter(n_extint_populations == 999)
Put all taxa with weird number of populations that need to be checked together:
# Collect every record whose population counts look suspicious: a negative or
# zero number of extant populations, or 999 in either population count.
check_n_pops <- kobo_output %>%
  filter(
    n_extant_populations <= 0 |
      n_extant_populations == 999 |
      n_extint_populations == 999
  ) %>%
  # keep only the variables of interest
  select(
    country_assessment, name_assessor, taxon,
    n_extant_populations, n_extint_populations
  ) %>%
  # tell the assessor what has to be reviewed
  mutate(
    need_to_check = "check number of extant or extint populations. Are 0 correct? should 999 be -999? are extant/extint confused?"
  )
Check GBIF
# Peek at the first few GBIF taxon IDs
kobo_output %>%
  pull(GBIF_taxonID) %>%
  head()
## [1] "2440940" "2337607" "2477927" "" "2422522" "5219516"
GBIF IDs tend to be 7 characters long. Some can be longer or shorter, but these seem to be exceptions. Therefore let’s flag any record whose GBIF ID is not exactly 7 characters long, so it can be manually checked.
# Records whose GBIF taxon ID is filled in but is not 7 characters long
kobo_output %>%
  # keep only the relevant columns
  select(country_assessment, name_assessor, taxon, GBIF_taxonID) %>%
  filter(nchar(GBIF_taxonID) > 0 & nchar(GBIF_taxonID) != 7)
Put them in their own happy df with a column stating what is the likely problem:
# Same selection as shown above (non-empty GBIF taxon IDs that are not
# 7 characters long), stored with a note describing the likely problem.
check_GBIF <- kobo_output %>%
  # keep only the relevant columns
  select(country_assessment, name_assessor, taxon, GBIF_taxonID) %>%
  filter(nchar(GBIF_taxonID) > 0 & nchar(GBIF_taxonID) != 7) %>%
  # tell the assessor what has to be reviewed
  mutate(
    need_to_check = "check the GBIF taxonID. Either it looks plain different, or has more or less than 7 digits (most ids are 7 digits long, and this isn't, it could be an exception, or a mistake)."
  )
Genus, species and subspecies should be a single word, check if there are cases where it isn’t. Only exception would be “var.” or “subsp.” in the subspecies_variety field:
# Show records where genus, species or subspecies_variety is not a single word.
# A space in subspecies_variety is tolerated only when the field contains the
# literal rank marker "var." or "subsp.".
#  * fixed = TRUE matches the markers literally; as regular expressions the
#    "." matches any character, so e.g. "variegata" would count as "var.".
#  * The exemption is applied to the subspecies_variety condition only, so a
#    multi-word genus or species is still reported when the infraspecific
#    name carries a valid "var." / "subsp." marker.
kobo_output %>%
  filter(
    grepl(" ", genus, fixed = TRUE) |
      grepl(" ", species, fixed = TRUE) |
      (grepl(" ", subspecies_variety, fixed = TRUE) &
        !grepl("var.", subspecies_variety, fixed = TRUE) &
        !grepl("subsp.", subspecies_variety, fixed = TRUE))
  ) %>%
  # show only relevant columns
  select(country_assessment, name_assessor, taxon, genus, species, subspecies_variety)
Put them in their own happy df with a column stating what is the likely problem:
# Collect records where genus, species or subspecies_variety is not a single
# word. A space in subspecies_variety is tolerated only when the field contains
# the literal rank marker "var." or "subsp.".
#  * fixed = TRUE matches the markers literally; as regular expressions the
#    "." matches any character, so e.g. "variegata" would count as "var.".
#  * The exemption is applied to the subspecies_variety condition only, so a
#    multi-word genus or species is still reported when the infraspecific
#    name carries a valid "var." / "subsp." marker.
check_taxon_names <- kobo_output %>%
  filter(
    grepl(" ", genus, fixed = TRUE) |
      grepl(" ", species, fixed = TRUE) |
      (grepl(" ", subspecies_variety, fixed = TRUE) &
        !grepl("var.", subspecies_variety, fixed = TRUE) &
        !grepl("subsp.", subspecies_variety, fixed = TRUE))
  ) %>%
  # show only relevant columns
  select(country_assessment, name_assessor, taxon, genus, species, subspecies_variety) %>%
  # tell the assessor what has to be reviewed
  mutate(
    need_to_check = "check genus, species or subspecies_variety, we are targeting to have single words in each field, except in the ifraspecific names, where 'var.' and 'subsp.' (only) would be accepted. Other details or taxonomic notes should be added in the comments."
  )
# Combine the three sets of flagged records into one table. No `by` is given,
# so each join uses the columns the two tables have in common.
to_check <- check_n_pops %>%
  full_join(check_GBIF) %>%
  full_join(check_taxon_names) %>%
  # put the columns in the desired order
  select(
    country_assessment, name_assessor, taxon, need_to_check,
    n_extant_populations, n_extint_populations,
    GBIF_taxonID,
    genus, species, subspecies_variety
  )
## Joining, by = c("country_assessment", "name_assessor", "taxon",
## "need_to_check")
## Joining, by = c("country_assessment", "name_assessor", "taxon",
## "need_to_check")
# Write the records that need manual review to a UTF-8 CSV file, without
# row names.
write.csv(
  x = to_check,
  file = "kobo_output_tocheck_30april.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
Session Info for reproducibility purposes:
# Record the R version, platform and package versions used for this run
utils::sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.4.1 stringr_1.4.0 utile.tools_0.2.7 dplyr_1.0.9
## [5] tidyr_1.2.0
##
## loaded via a namespace (and not attached):
## [1] highr_0.9 pillar_1.7.0 bslib_0.3.1 compiler_4.2.1
## [5] jquerylib_0.1.4 tools_4.2.1 digest_0.6.29 gtable_0.3.0
## [9] jsonlite_1.8.0 evaluate_0.15 lifecycle_1.0.3 tibble_3.1.7
## [13] pkgconfig_2.0.3 rlang_1.0.6 cli_3.6.0 DBI_1.1.3
## [17] rstudioapi_0.13 yaml_2.3.5 xfun_0.31 fastmap_1.1.0
## [21] withr_2.5.0 knitr_1.39 generics_0.1.3 vctrs_0.5.2
## [25] sass_0.4.1 grid_4.2.1 tidyselect_1.1.2 glue_1.6.2
## [29] R6_2.5.1 fansi_1.0.3 rmarkdown_2.14 farver_2.1.1
## [33] purrr_0.3.4 magrittr_2.0.3 scales_1.2.0 ellipsis_0.3.2
## [37] htmltools_0.5.5 assertthat_0.2.1 colorspace_2.0-3 labeling_0.4.2
## [41] utf8_1.2.2 stringi_1.7.6 munsell_0.5.0 crayon_1.5.1